diff --git a/pkg/storage/apply/task.go b/pkg/storage/apply/task.go index 8bfa62acd0a1..26e9ef9511dd 100644 --- a/pkg/storage/apply/task.go +++ b/pkg/storage/apply/task.go @@ -12,6 +12,7 @@ package apply import ( "context" + "errors" "go.etcd.io/etcd/raft/raftpb" ) @@ -54,6 +55,10 @@ type StateMachine interface { ApplySideEffects(CheckedCommand) (AppliedCommand, error) } +// ErrRemoved can be returned from ApplySideEffects which will stop the +// task from processing more commands and return immediately. +var ErrRemoved = errors.New("replica removed") + // Batch accumulates a series of updates from Commands and performs them // all at once to its StateMachine when applied. Groups of Commands will be // staged in the Batch such that one or more trivial Commands are staged or diff --git a/pkg/storage/client_merge_test.go b/pkg/storage/client_merge_test.go index f6073a23ec68..f3ee1338ae62 100644 --- a/pkg/storage/client_merge_test.go +++ b/pkg/storage/client_merge_test.go @@ -57,7 +57,6 @@ import ( "github.com/pkg/errors" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "go.etcd.io/etcd/raft" "go.etcd.io/etcd/raft/raftpb" ) @@ -1638,6 +1637,7 @@ func TestStoreReplicaGCAfterMerge(t *testing.T) { storeCfg.TestingKnobs.DisableReplicateQueue = true storeCfg.TestingKnobs.DisableReplicaGCQueue = true storeCfg.TestingKnobs.DisableMergeQueue = true + storeCfg.TestingKnobs.DisableEagerReplicaRemoval = true mtc := &multiTestContext{storeConfig: &storeCfg} mtc.Start(t, 2) defer mtc.Stop() @@ -1662,10 +1662,10 @@ func TestStoreReplicaGCAfterMerge(t *testing.T) { for _, rangeID := range []roachpb.RangeID{lhsDesc.RangeID, rhsDesc.RangeID} { repl, err := store1.GetReplica(rangeID) if err != nil { - t.Fatal(err) + continue } if err := store1.ManualReplicaGC(repl); err != nil { - t.Fatal(err) + t.Logf("replica was already removed: %v", err) } if _, err := store1.GetReplica(rangeID); err == nil { t.Fatalf("replica of r%d not gc'd from s1", rangeID) @@ -2041,6 +2041,7 @@ func TestStoreRangeMergeAbandonedFollowers(t *testing.T) { storeCfg.TestingKnobs.DisableReplicaGCQueue = true storeCfg.TestingKnobs.DisableSplitQueue = true storeCfg.TestingKnobs.DisableMergeQueue = true + storeCfg.TestingKnobs.DisableEagerReplicaRemoval = true mtc := &multiTestContext{storeConfig: &storeCfg} mtc.Start(t, 3) defer mtc.Stop() @@ -2808,74 +2809,6 @@ func TestStoreRangeMergeSlowWatcher(t *testing.T) { } } -// unreliableRaftHandler drops all Raft messages that are addressed to the -// specified rangeID, but lets all other messages through. -type unreliableRaftHandler struct { - rangeID roachpb.RangeID - storage.RaftMessageHandler - // If non-nil, can return false to avoid dropping a msg to rangeID - dropReq func(*storage.RaftMessageRequest) bool - dropHB func(*storage.RaftHeartbeat) bool - dropResp func(*storage.RaftMessageResponse) bool -} - -func (h *unreliableRaftHandler) HandleRaftRequest( - ctx context.Context, - req *storage.RaftMessageRequest, - respStream storage.RaftMessageResponseStream, -) *roachpb.Error { - if len(req.Heartbeats)+len(req.HeartbeatResps) > 0 { - reqCpy := *req - req = &reqCpy - req.Heartbeats = h.filterHeartbeats(req.Heartbeats) - req.HeartbeatResps = h.filterHeartbeats(req.HeartbeatResps) - if len(req.Heartbeats)+len(req.HeartbeatResps) == 0 { - // Entirely filtered. 
- return nil - } - } else if req.RangeID == h.rangeID { - if h.dropReq == nil || h.dropReq(req) { - log.Infof( - ctx, - "dropping Raft message %s", - raft.DescribeMessage(req.Message, func([]byte) string { - return "" - }), - ) - - return nil - } - } - return h.RaftMessageHandler.HandleRaftRequest(ctx, req, respStream) -} - -func (h *unreliableRaftHandler) filterHeartbeats( - hbs []storage.RaftHeartbeat, -) []storage.RaftHeartbeat { - if len(hbs) == 0 { - return hbs - } - var cpy []storage.RaftHeartbeat - for i := range hbs { - hb := &hbs[i] - if hb.RangeID != h.rangeID || (h.dropHB != nil && !h.dropHB(hb)) { - cpy = append(cpy, *hb) - } - } - return cpy -} - -func (h *unreliableRaftHandler) HandleRaftResponse( - ctx context.Context, resp *storage.RaftMessageResponse, -) error { - if resp.RangeID == h.rangeID { - if h.dropResp == nil || h.dropResp(resp) { - return nil - } - } - return h.RaftMessageHandler.HandleRaftResponse(ctx, resp) -} - func TestStoreRangeMergeRaftSnapshot(t *testing.T) { defer leaktest.AfterTest(t)() @@ -3353,9 +3286,12 @@ func TestMergeQueue(t *testing.T) { t.Run("non-collocated", func(t *testing.T) { reset(t) verifyUnmerged(t) - mtc.replicateRange(rhs().RangeID, 1) - mtc.transferLease(ctx, rhs().RangeID, 0, 1) - mtc.unreplicateRange(rhs().RangeID, 0) + rhsRangeID := rhs().RangeID + mtc.replicateRange(rhsRangeID, 1) + mtc.transferLease(ctx, rhsRangeID, 0, 1) + mtc.unreplicateRange(rhsRangeID, 0) + require.NoError(t, mtc.waitForUnreplicated(rhsRangeID, 0)) + clearRange(t, lhsStartKey, rhsEndKey) store.MustForceMergeScanAndProcess() verifyMerged(t) diff --git a/pkg/storage/client_metrics_test.go b/pkg/storage/client_metrics_test.go index 4fd43fcbcc21..c91d386c9e81 100644 --- a/pkg/storage/client_metrics_test.go +++ b/pkg/storage/client_metrics_test.go @@ -26,6 +26,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/metric" "github.com/pkg/errors" + "github.com/stretchr/testify/require" ) func checkGauge(t *testing.T, id string, g *metric.Gauge, e int64) { @@ -313,8 +314,9 @@ func TestStoreMetrics(t *testing.T) { return mtc.unreplicateRangeNonFatal(replica.RangeID, 0) }) - // Force GC Scan on store 0 in order to fully remove range. - mtc.stores[1].MustForceReplicaGCScanAndProcess() + // Wait until we're sure that store 0 has successfully processed its removal. + require.NoError(t, mtc.waitForUnreplicated(replica.RangeID, 0)) + mtc.waitForValues(roachpb.Key("z"), []int64{0, 5, 5}) // Verify range count is as expected. diff --git a/pkg/storage/client_raft_helpers_test.go b/pkg/storage/client_raft_helpers_test.go new file mode 100644 index 000000000000..2e05bd9c4602 --- /dev/null +++ b/pkg/storage/client_raft_helpers_test.go @@ -0,0 +1,115 @@ +// Copyright 2019 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package storage_test + +import ( + "context" + + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/storage" + "github.com/cockroachdb/cockroach/pkg/util/log" + "go.etcd.io/etcd/raft" +) + +// unreliableRaftHandler drops all Raft messages that are addressed to the +// specified rangeID, but lets all other messages through. 
+type unreliableRaftHandler struct { + rangeID roachpb.RangeID + storage.RaftMessageHandler + // If non-nil, can return false to avoid dropping a msg to rangeID + dropReq func(*storage.RaftMessageRequest) bool + dropHB func(*storage.RaftHeartbeat) bool + dropResp func(*storage.RaftMessageResponse) bool +} + +func (h *unreliableRaftHandler) HandleRaftRequest( + ctx context.Context, + req *storage.RaftMessageRequest, + respStream storage.RaftMessageResponseStream, +) *roachpb.Error { + if len(req.Heartbeats)+len(req.HeartbeatResps) > 0 { + reqCpy := *req + req = &reqCpy + req.Heartbeats = h.filterHeartbeats(req.Heartbeats) + req.HeartbeatResps = h.filterHeartbeats(req.HeartbeatResps) + if len(req.Heartbeats)+len(req.HeartbeatResps) == 0 { + // Entirely filtered. + return nil + } + } else if req.RangeID == h.rangeID { + if h.dropReq == nil || h.dropReq(req) { + log.Infof( + ctx, + "dropping Raft message %s", + raft.DescribeMessage(req.Message, func([]byte) string { + return "" + }), + ) + + return nil + } + } + return h.RaftMessageHandler.HandleRaftRequest(ctx, req, respStream) +} + +func (h *unreliableRaftHandler) filterHeartbeats( + hbs []storage.RaftHeartbeat, +) []storage.RaftHeartbeat { + if len(hbs) == 0 { + return hbs + } + var cpy []storage.RaftHeartbeat + for i := range hbs { + hb := &hbs[i] + if hb.RangeID != h.rangeID || (h.dropHB != nil && !h.dropHB(hb)) { + cpy = append(cpy, *hb) + } + } + return cpy +} + +func (h *unreliableRaftHandler) HandleRaftResponse( + ctx context.Context, resp *storage.RaftMessageResponse, +) error { + if resp.RangeID == h.rangeID { + if h.dropResp == nil || h.dropResp(resp) { + return nil + } + } + return h.RaftMessageHandler.HandleRaftResponse(ctx, resp) +} + +// mtcStoreRaftMessageHandler exists to allows a store to be stopped and +// restarted while maintaining a partition using an unreliableRaftHandler. 
+type mtcStoreRaftMessageHandler struct { + mtc *multiTestContext + storeIdx int +} + +func (h *mtcStoreRaftMessageHandler) HandleRaftRequest( + ctx context.Context, + req *storage.RaftMessageRequest, + respStream storage.RaftMessageResponseStream, +) *roachpb.Error { + return h.mtc.Store(h.storeIdx).HandleRaftRequest(ctx, req, respStream) +} + +func (h *mtcStoreRaftMessageHandler) HandleRaftResponse( + ctx context.Context, resp *storage.RaftMessageResponse, +) error { + return h.mtc.Store(h.storeIdx).HandleRaftResponse(ctx, resp) +} + +func (h *mtcStoreRaftMessageHandler) HandleSnapshot( + header *storage.SnapshotRequest_Header, respStream storage.SnapshotResponseStream, +) error { + return h.mtc.Store(h.storeIdx).HandleSnapshot(header, respStream) +} diff --git a/pkg/storage/client_raft_test.go b/pkg/storage/client_raft_test.go index 12335fcc96ea..ab9108e2ffe1 100644 --- a/pkg/storage/client_raft_test.go +++ b/pkg/storage/client_raft_test.go @@ -30,12 +30,14 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/rpc" "github.com/cockroachdb/cockroach/pkg/rpc/nodedialer" + "github.com/cockroachdb/cockroach/pkg/server" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/storage/engine" "github.com/cockroachdb/cockroach/pkg/storage/storagebase" "github.com/cockroachdb/cockroach/pkg/storage/storagepb" "github.com/cockroachdb/cockroach/pkg/testutils" + "github.com/cockroachdb/cockroach/pkg/testutils/testcluster" "github.com/cockroachdb/cockroach/pkg/util" "github.com/cockroachdb/cockroach/pkg/util/ctxgroup" "github.com/cockroachdb/cockroach/pkg/util/hlc" @@ -1192,15 +1194,12 @@ func TestReplicateAfterRemoveAndSplit(t *testing.T) { return err } - if err := replicateRHS(); !testutils.IsError(err, storage.IntersectingSnapshotMsg) { - t.Fatalf("unexpected error %v", err) - } - - // Enable the replica GC queue so that the next attempt to replicate the RHS - // to store 2 will cause the obsolete replica to be GC'd allowing a - // subsequent replication to succeed. - mtc.stores[2].SetReplicaGCQueueActive(true) - + // This used to fail with IntersectingSnapshotMsg because we relied on replica + // GC to remove the LHS and that queue is disabled. Now we will detect that + // the LHS is not part of the range because of a ReplicaTooOldError and then + // we'll replicaGC the LHS in response. + // TODO(ajwerner): filter the reponses to node 2 or disable this eager + // replicaGC. testutils.SucceedsSoon(t, replicateRHS) } @@ -3000,6 +2999,52 @@ func TestReplicateRogueRemovedNode(t *testing.T) { defer mtc.Stop() mtc.Start(t, 3) + // We're going to set up the cluster with partitioning so that we can + // partition node 0 from the others. We do this by installing + // unreliableRaftHandler listeners on all three Stores which we can enable + // and disable with an atomic. The handler on the partitioned store filters + // out all messages while the handler on the other two stores only filters + // out messages from the partitioned store. 
When activated the configuration + // looks like: + // + // [0] + // x x + // / \ + // x x + // [1]<---->[2] + const partStore = 0 + var partitioned atomic.Value + partitioned.Store(false) + partRepl, err := mtc.stores[partStore].GetReplica(1) + if err != nil { + t.Fatal(err) + } + partReplDesc, err := partRepl.GetReplicaDescriptor() + if err != nil { + t.Fatal(err) + } + for _, s := range []int{0, 1, 2} { + s := s + h := &unreliableRaftHandler{ + rangeID: 1, + RaftMessageHandler: &mtcStoreRaftMessageHandler{ + mtc: mtc, + storeIdx: s, + }, + } + // Only filter messages from the partitioned store on the other + // two stores. + h.dropReq = func(req *storage.RaftMessageRequest) bool { + return partitioned.Load().(bool) && + (s == partStore || req.FromReplica.StoreID == partRepl.StoreID()) + } + h.dropHB = func(hb *storage.RaftHeartbeat) bool { + return partitioned.Load().(bool) && + (s == partStore || hb.FromReplicaID == partReplDesc.ReplicaID) + } + mtc.transport.Listen(mtc.stores[s].Ident.StoreID, h) + } + // First put the range on all three nodes. raftID := roachpb.RangeID(1) mtc.replicateRange(raftID, 1, 2) @@ -3044,7 +3089,9 @@ func TestReplicateRogueRemovedNode(t *testing.T) { } return nil }) - + // Partition nodes 1 and 2 from node 0. Otherwise they'd get a + // ReplicaTooOldError from node 0 and proceed to remove themselves. + partitioned.Store(true) // Bring node 2 back up. mtc.restartStore(2) @@ -3547,6 +3594,7 @@ func TestRemoveRangeWithoutGC(t *testing.T) { sc := storage.TestStoreConfig(nil) sc.TestingKnobs.DisableReplicaGCQueue = true + sc.TestingKnobs.DisableEagerReplicaRemoval = true mtc := &multiTestContext{storeConfig: &sc} defer mtc.Stop() mtc.Start(t, 2) @@ -3563,18 +3611,19 @@ func TestRemoveRangeWithoutGC(t *testing.T) { if err != nil { return err } - desc := rep.Desc() - if len(desc.InternalReplicas) != 1 { - return errors.Errorf("range has %d replicas", len(desc.InternalReplicas)) + if _, err := rep.IsDestroyed(); err == nil { + return errors.Errorf("range is still alive") } return nil }) // The replica's data is still on disk. + // We use an inconsistent scan because there's going to be an intent on the + // range descriptor to remove this replica. var desc roachpb.RangeDescriptor descKey := keys.RangeDescriptorKey(roachpb.RKeyMin) if ok, err := engine.MVCCGetProto(context.Background(), mtc.stores[0].Engine(), descKey, - mtc.stores[0].Clock().Now(), &desc, engine.MVCCGetOptions{}); err != nil { + mtc.stores[0].Clock().Now(), &desc, engine.MVCCGetOptions{Inconsistent: true}); err != nil { t.Fatal(err) } else if !ok { t.Fatal("expected range descriptor to be present") @@ -4595,3 +4644,197 @@ func TestAckWriteBeforeApplication(t *testing.T) { }) } } + +// TestProcessSplitAfterRightHandSideHasBeenRemoved tests cases where we have +// a follower replica which has received information about the RHS of a split +// before it has processed that split. The replica can't both have an +// initialized RHS and process the split but it can have an uninitialized RHS +// with a higher replica ID than lives in the split and it can have a RHS with +// an unknown replica ID and a tombstone at exactly the replica ID of the RHS. +// +// Starting in 19.2 if a replica discovers from a raft message that it is an +// old replica then it knows that it has been removed and re-added to the range. +// In this case the Replica eagerly destroys itself and its data. +// +// Given this behavior there are 2 troubling cases with regards to splits. 
+// +// * In all cases we begin with the s1 processing a presplit snapshot for r20. +// Let's assume after the split the store should have r21/3. +// +// * Store 1 receives a message for r21/3 prior to acquiring the split lock +// in r21. This will create an uninitialized r21/3 which may write +// HardState. +// +// * Before the r20 processes the split r21 is removed and re-added to +// s1 as r21/4. s1 receives a raft message destined for r21/4 and proceeds +// to destroy its uninitialized r21/3, laying down a tombstone at 4 in the +// process. +// +// 1) s1 processes the split and finds the RHS to be an uninitialized replica +// with a higher replica ID. +// +// 2) s1 crashes before processing the split, forgetting the replica ID of the +// RHS. +// +// In both cases we should not synthesize a HardState for the RHS as it would +// contain a non-zero Commit index which the RHS, which is known to be a later +// replica than the split, should not have until it has received a snapshot. +// Furthermore we must destroy the data that the RHS would have inherited. +// See https://github.com/cockroachdb/cockroach/issues/40470. +func TestProcessSplitAfterRightHandSideHasBeenRemoved(t *testing.T) { + defer leaktest.AfterTest(t)() + + t.Skip("still WIP") + sc := storage.TestStoreConfig(nil) + // Newly-started stores (including the "rogue" one) should not GC + // their replicas. We'll turn this back on when needed. + sc.TestingKnobs.DisableReplicaGCQueue = true + mtc := &multiTestContext{ + storeConfig: &sc, + } + defer mtc.Stop() + mtc.Start(t, 3) + + // We're going to set up the cluster with partitioning so that we can + // partition node 0 from the others. We do this by installing + // unreliableRaftHandler listeners on all three Stores which we can enable + // and disable with an atomic. The handler on the partitioned store filters + // out all messages while the handler on the other two stores only filters + // out messages from the partitioned store. When activated the configuration + // looks like: + // + // [0] + // x x + // / \ + // x x + // [1]<---->[2] + const partStore = 0 + var partitioned atomic.Value + partitioned.Store(false) + partRepl, err := mtc.stores[partStore].GetReplica(1) + if err != nil { + t.Fatal(err) + } + partReplDesc, err := partRepl.GetReplicaDescriptor() + if err != nil { + t.Fatal(err) + } + for _, s := range []int{0, 1, 2} { + s := s + h := &unreliableRaftHandler{ + rangeID: 1, + RaftMessageHandler: &mtcStoreRaftMessageHandler{ + mtc: mtc, + storeIdx: s, + }, + } + // Only filter messages from the partitioned store on the other + // two stores. + h.dropReq = func(req *storage.RaftMessageRequest) bool { + return partitioned.Load().(bool) && + (s == partStore || req.FromReplica.StoreID == partRepl.StoreID()) + } + h.dropHB = func(hb *storage.RaftHeartbeat) bool { + return partitioned.Load().(bool) && + (s == partStore || hb.FromReplicaID == partReplDesc.ReplicaID) + } + mtc.transport.Listen(mtc.stores[s].Ident.StoreID, h) + } + + // First put the range on all three nodes. + raftID := roachpb.RangeID(1) + mtc.replicateRange(raftID, 1, 2) + + // Put some data in the range so we'll have something to test for. + incArgs := incrementArgs([]byte("a"), 5) + if _, err := client.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil { + t.Fatal(err) + } + + // Wait for all nodes to catch up. 
+	mtc.waitForValues(roachpb.Key("a"), []int64{5, 5, 5})
+
+	// Now we partition a replica's LHS, perform a split, remove the RHS, add it
+	// back, and make sure it votes. Then we'll exercise the two different cases
+	// of killing the store and not killing the store before we clear the
+	// partition.
+}
+
+// TestUpgradeFromPreemptiveSnapshot exercises scenarios where a store has
+// received a preemptive snapshot for a range and then later is added to the
+// range as a learner.
+//
+// In these cases it is critical that the store destroy the replica created by
+// the preemptive snapshot, otherwise the replica may try to catch up across
+// commands which are not safe (namely merges).
+// Before learner replicas we would not add a store to a range until we had
+// successfully sent a preemptive snapshot that contained the range descriptor
+// used in the change replicas request.
+func TestUpgradeFromPreemptiveSnapshot(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	ctx := context.Background()
+	var proposalFilter atomic.Value
+	noopProposalFilter := storagebase.ReplicaProposalFilter(func(storagebase.ProposalFilterArgs) *roachpb.Error {
+		return nil
+	})
+	blockAllProposalFilter := storagebase.ReplicaProposalFilter(func(args storagebase.ProposalFilterArgs) *roachpb.Error {
+		return roachpb.NewError(errors.Errorf("boom"))
+	})
+	setupTestCluster := func(t *testing.T) *testcluster.TestCluster {
+		proposalFilter.Store(noopProposalFilter)
+		tcArgs := base.TestClusterArgs{
+			ReplicationMode: base.ReplicationManual,
+			ServerArgs: base.TestServerArgs{
+				Knobs: base.TestingKnobs{
+					Store: &storage.StoreTestingKnobs{
+						BootstrapVersion: &cluster.ClusterVersion{
+							cluster.VersionByKey(cluster.Version19_1),
+						},
+						DisableLoadBasedSplitting: true,
+						DisableMergeQueue:         true,
+						TestingProposalFilter: func(args storagebase.ProposalFilterArgs) *roachpb.Error {
+							return proposalFilter.Load().(storagebase.ReplicaProposalFilter)(args)
+						},
+					},
+					Server: &server.TestingKnobs{
+						DisableAutomaticVersionUpgrade: 1,
+					},
+				},
+			},
+		}
+		return testcluster.StartTestCluster(t, 4, tcArgs)
+	}
+	t.Run("add after preemptive snapshot before merge", func(t *testing.T) {
+		tc := setupTestCluster(t)
+		defer tc.Stopper().Stop(ctx)
+		scratchStartKey := tc.ScratchRange(t)
+		keyA := append(scratchStartKey[:len(scratchStartKey):len(scratchStartKey)], 'a')
+
+		// Set up a scratch range on n1, n2, and n3.
+		tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(1), tc.Target(2))
+		// Split that range.
+		err := tc.Server(0).DB().AdminSplit(ctx, scratchStartKey, keyA, hlc.Timestamp{})
+		require.NoError(t, err)
+		// Prevent n4 from successfully being added by blocking the proposal.
+		proposalFilter.Store(blockAllProposalFilter)
+		// Issue the add to send a preemptive snapshot.
+		_, err = tc.AddReplicas(scratchStartKey, tc.Target(3))
+		// Ensure that the add failed with our proposal filter error.
+		require.True(t, testutils.IsError(err, "boom"), err)
+		// Reset the filter.
+		proposalFilter.Store(noopProposalFilter)
+		// Merge the range back together.
+		err = tc.Server(0).DB().AdminMerge(ctx, scratchStartKey)
+		require.NoError(t, err)
+		// Upgrade the cluster to use learners.
+		_, err = tc.ServerConn(0).
+			Exec("SET CLUSTER SETTING version = crdb_internal.node_executable_version();")
+		require.NoError(t, err)
+		// Successfully add node 4; this used to fail as node 4 would attempt to
+		// catch up from the preemptive snapshot across the merge.
+		desc := tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(3))
+		// Transfer the lease to node 4 and remove node 0 to ensure that it really
+		// is a part of the range.
+		require.NoError(t, tc.TransferRangeLease(desc, tc.Target(3)))
+		tc.RemoveReplicasOrFatal(t, scratchStartKey, tc.Target(0))
+	})
+}
diff --git a/pkg/storage/client_replica_gc_test.go b/pkg/storage/client_replica_gc_test.go
index 7b8e41cf1b36..4162ffca8d85 100644
--- a/pkg/storage/client_replica_gc_test.go
+++ b/pkg/storage/client_replica_gc_test.go
@@ -148,8 +148,12 @@ func TestReplicaGCQueueDropReplicaDirect(t *testing.T) {
 // removes a range from a store that no longer should have a replica.
 func TestReplicaGCQueueDropReplicaGCOnScan(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 	mtc := &multiTestContext{}
+	cfg := storage.TestStoreConfig(nil)
+	cfg.TestingKnobs.DisableEagerReplicaRemoval = true
+	mtc.storeConfig = &cfg
+
 	defer mtc.Stop()
 	mtc.Start(t, 3)
 	// Disable the replica gc queue to prevent direct removal of replica.
diff --git a/pkg/storage/client_split_test.go b/pkg/storage/client_split_test.go
index 6db3d819b5f9..f3cb67ba9fc8 100644
--- a/pkg/storage/client_split_test.go
+++ b/pkg/storage/client_split_test.go
@@ -3314,8 +3314,12 @@ func TestSplitTriggerMeetsUnexpectedReplicaID(t *testing.T) {
 	// different replicaID than the split trigger expects.
 	add := func() {
 		_, err := tc.AddReplicas(kRHS, tc.Target(1))
-		if !testutils.IsError(err, `snapshot intersects existing range`) {
-			t.Fatalf(`expected snapshot intersects existing range" error got: %+v`, err)
+		// The "snapshot intersects existing range" error is expected if the store
+		// has not heard a raft message addressed to a later replica ID, while the
+		// "was not found on" error is expected if the store has heard that it has
+		// a newer replica ID before receiving the snapshot.
+		if !testutils.IsError(err, `snapshot intersects existing range|r[0-9]+ was not found on s[0-9]+`) {
+			t.Fatalf(`expected "snapshot intersects existing range|r[0-9]+ was not found on s[0-9]+" error got: %+v`, err)
 		}
 	}
 	for i := 0; i < 5; i++ {
@@ -3361,7 +3365,8 @@ func TestSplitTriggerMeetsUnexpectedReplicaID(t *testing.T) {
 		if err != nil {
 			return err
 		}
-		if desc := repl.Desc(); !descLHS.Equal(desc) {
+		if desc := repl.Desc(); desc.IsInitialized() && !descLHS.Equal(desc) {
+			require.NoError(t, store.ManualReplicaGC(repl))
 			return errors.Errorf("expected %s got %s", &descLHS, desc)
 		}
 		return nil
diff --git a/pkg/storage/client_test.go b/pkg/storage/client_test.go
index 11c57695ff08..ff93915bb36e 100644
--- a/pkg/storage/client_test.go
+++ b/pkg/storage/client_test.go
@@ -1243,6 +1243,21 @@ func (m *multiTestContext) unreplicateRangeNonFatal(rangeID roachpb.RangeID, des
 	return err
 }
 
+func (m *multiTestContext) waitForUnreplicated(rangeID roachpb.RangeID, dest int) error {
+	// Wait for the removal to complete on the destination node.
+	return retry.ForDuration(testutils.DefaultSucceedsSoonDuration, func() error {
+		_, err := m.stores[dest].GetReplica(rangeID)
+		switch err.(type) {
+		case nil:
+			return fmt.Errorf("replica still exists on dest %d", dest)
+		case *roachpb.RangeNotFoundError:
+			return nil
+		default:
+			return err
+		}
+	})
+}
+
 // readIntFromEngines reads the current integer value at the given key
 // from all configured engines, filling in zeros when the value is not
 // found. Returns a slice of the same length as mtc.engines.
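The new multiTestContext.waitForUnreplicated helper above treats a *roachpb.RangeNotFoundError from GetReplica as success and a nil error as "replica still present". The following is a hedged usage sketch only, not part of this diff; the function name and test shape are illustrative, and only the mtc helpers visible elsewhere in this change are assumed.

// exampleWaitForRemoval is an illustrative sketch, not code from this change.
// It assumes a multiTestContext with the range already replicated to store 0,
// as in the tests touched above.
func exampleWaitForRemoval(t *testing.T, mtc *multiTestContext, rangeID roachpb.RangeID) {
	// Remove store 0 from the range, then wait until store 0 has actually
	// processed its removal before asserting on store-level state.
	mtc.unreplicateRange(rangeID, 0)
	if err := mtc.waitForUnreplicated(rangeID, 0); err != nil {
		t.Fatal(err)
	}
	// Once the wait returns, GetReplica on store 0 should fail with a
	// RangeNotFoundError rather than return a live replica.
	if _, err := mtc.stores[0].GetReplica(rangeID); err == nil {
		t.Fatalf("expected replica of r%d to be removed from store 0", rangeID)
	}
}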
diff --git a/pkg/storage/helpers_test.go b/pkg/storage/helpers_test.go index 3e3530971c45..ea54e03d5bd6 100644 --- a/pkg/storage/helpers_test.go +++ b/pkg/storage/helpers_test.go @@ -221,16 +221,6 @@ func NewTestStorePool(cfg StoreConfig) *StorePool { ) } -func (r *Replica) ReplicaID() roachpb.ReplicaID { - r.mu.RLock() - defer r.mu.RUnlock() - return r.ReplicaIDLocked() -} - -func (r *Replica) ReplicaIDLocked() roachpb.ReplicaID { - return r.mu.replicaID -} - func (r *Replica) AssertState(ctx context.Context, reader engine.Reader) { r.raftMu.Lock() defer r.raftMu.Unlock() @@ -269,10 +259,13 @@ func (r *Replica) InitQuotaPool(quota uint64) error { r.mu.Lock() defer r.mu.Unlock() var appliedIndex uint64 - err := r.withRaftGroupLocked(false, func(r *raft.RawNode) (unquiesceAndWakeLeader bool, err error) { + isRemoved, err := r.withRaftGroupLocked(false, func(r *raft.RawNode) (unquiesceAndWakeLeader bool, err error) { appliedIndex = r.BasicStatus().Applied return false, nil }) + if isRemoved { + _, err = r.IsDestroyed() + } if err != nil { return err } diff --git a/pkg/storage/merge_queue.go b/pkg/storage/merge_queue.go index 208499998750..7db6813b146b 100644 --- a/pkg/storage/merge_queue.go +++ b/pkg/storage/merge_queue.go @@ -291,13 +291,11 @@ func (mq *mergeQueue) process( log.VEventf(ctx, 2, `%v`, err) return err } - rhsDesc, err = maybeLeaveAtomicChangeReplicas(ctx, store, rhsDesc) if err != nil { log.VEventf(ctx, 2, `%v`, err) return err } - rhsDesc, err = removeLearners(ctx, db, rhsDesc) if err != nil { log.VEventf(ctx, 2, `%v`, err) diff --git a/pkg/storage/queue.go b/pkg/storage/queue.go index 3ef925cc0304..ca3a996f2210 100644 --- a/pkg/storage/queue.go +++ b/pkg/storage/queue.go @@ -59,8 +59,9 @@ type processCallback func(error) // A replicaItem holds a replica and metadata about its queue state and // processing state. type replicaItem struct { - value roachpb.RangeID - seq int // enforce FIFO order for equal priorities + value roachpb.RangeID + replicaID roachpb.ReplicaID + seq int // enforce FIFO order for equal priorities // fields used when a replicaItem is enqueued in a priority queue. priority float64 @@ -89,6 +90,12 @@ func (i *replicaItem) registerCallback(cb processCallback) { i.callbacks = append(i.callbacks, cb) } +// sameReplica returns true if the passed replicaID matches the item's replica +// ID or the item's replica ID is zero. +func (i *replicaItem) sameReplica(replicaID roachpb.ReplicaID) bool { + return i.replicaID == 0 || i.replicaID == replicaID +} + // A priorityQueue implements heap.Interface and holds replicaItems. type priorityQueue struct { seqGen int @@ -180,6 +187,7 @@ func shouldQueueAgain(now, last hlc.Timestamp, minInterval time.Duration) (bool, // extraction. Establish a sane interface and use that. type replicaInQueue interface { AnnotateCtx(context.Context) context.Context + ReplicaID() roachpb.ReplicaID StoreID() roachpb.StoreID GetRangeID() roachpb.RangeID IsInitialized() bool @@ -487,7 +495,7 @@ func (h baseQueueHelper) MaybeAdd(ctx context.Context, repl replicaInQueue, now } func (h baseQueueHelper) Add(ctx context.Context, repl replicaInQueue, prio float64) { - _, err := h.bq.addInternal(ctx, repl.Desc(), prio) + _, err := h.bq.addInternal(ctx, repl.Desc(), repl.ReplicaID(), prio) if err != nil && log.V(1) { log.Infof(ctx, "during Add: %s", err) } @@ -595,7 +603,7 @@ func (bq *baseQueue) maybeAdd(ctx context.Context, repl replicaInQueue, now hlc. 
if !should { return } - if _, err := bq.addInternal(ctx, repl.Desc(), priority); !isExpectedQueueError(err) { + if _, err := bq.addInternal(ctx, repl.Desc(), repl.ReplicaID(), priority); !isExpectedQueueError(err) { log.Errorf(ctx, "unable to add: %+v", err) } } @@ -612,7 +620,7 @@ func (bq *baseQueue) requiresSplit(cfg *config.SystemConfig, repl replicaInQueue // the replica is already queued at a lower priority, updates the existing // priority. Expects the queue lock to be held by caller. func (bq *baseQueue) addInternal( - ctx context.Context, desc *roachpb.RangeDescriptor, priority float64, + ctx context.Context, desc *roachpb.RangeDescriptor, replicaID roachpb.ReplicaID, priority float64, ) (bool, error) { // NB: this is intentionally outside of bq.mu to avoid having to consider // lock ordering constraints. @@ -665,7 +673,7 @@ func (bq *baseQueue) addInternal( if log.V(3) { log.Infof(ctx, "adding: priority=%0.3f", priority) } - item = &replicaItem{value: desc.RangeID, priority: priority} + item = &replicaItem{value: desc.RangeID, replicaID: replicaID, priority: priority} bq.addLocked(item) // If adding this replica has pushed the queue past its maximum size, @@ -856,6 +864,9 @@ func (bq *baseQueue) processReplica(ctx context.Context, repl replicaInQueue) er if !repl.IsInitialized() { // We checked this when adding the replica, but we need to check it again // in case this is a different replica with the same range ID (see #14193). + // This is possible in the case where the replica was enqueued while not + // having a replica ID, perhaps due to a pre-emptive snapshot, and has + // since been removed and re-added at a different replica ID. return errors.New("cannot process uninitialized replica") } @@ -1092,21 +1103,21 @@ func (bq *baseQueue) addToPurgatoryLocked( // Remove all items from purgatory into a copied slice. bq.mu.Lock() - ranges := make([]roachpb.RangeID, 0, len(bq.mu.purgatory)) + ranges := make([]*replicaItem, 0, len(bq.mu.purgatory)) for rangeID := range bq.mu.purgatory { item := bq.mu.replicas[rangeID] if item == nil { log.Fatalf(ctx, "r%d is in purgatory but not in replicas", rangeID) } item.setProcessing() - ranges = append(ranges, item.value) + ranges = append(ranges, item) bq.removeFromPurgatoryLocked(item) } bq.mu.Unlock() - for _, id := range ranges { - repl, err := bq.getReplica(id) - if err != nil { + for _, item := range ranges { + repl, err := bq.getReplica(item.value) + if err != nil || !item.sameReplica(repl.ReplicaID()) { continue } annotatedCtx := repl.AnnotateCtx(ctx) @@ -1168,11 +1179,12 @@ func (bq *baseQueue) pop() replicaInQueue { bq.mu.Unlock() repl, _ := bq.getReplica(item.value) - if repl != nil { + if repl != nil && item.sameReplica(repl.ReplicaID()) { return repl } - // Replica not found, remove from set and try again. + // Replica not found or was recreated with a new replica ID, remove from + // set and try again. 
bq.mu.Lock() bq.removeFromReplicaSetLocked(item.value) } diff --git a/pkg/storage/queue_concurrency_test.go b/pkg/storage/queue_concurrency_test.go index b87d4a8f1c27..afe28799175d 100644 --- a/pkg/storage/queue_concurrency_test.go +++ b/pkg/storage/queue_concurrency_test.go @@ -145,7 +145,8 @@ func (fakeQueueImpl) purgatoryChan() <-chan time.Time { } type fakeReplica struct { - id roachpb.RangeID + id roachpb.RangeID + replicaID roachpb.ReplicaID } func (fr *fakeReplica) AnnotateCtx(ctx context.Context) context.Context { return ctx } @@ -153,6 +154,7 @@ func (fr *fakeReplica) StoreID() roachpb.StoreID { return 1 } func (fr *fakeReplica) GetRangeID() roachpb.RangeID { return fr.id } +func (fr *fakeReplica) ReplicaID() roachpb.ReplicaID { return fr.replicaID } func (fr *fakeReplica) IsInitialized() bool { return true } func (fr *fakeReplica) IsDestroyed() (DestroyReason, error) { return destroyReasonAlive, nil } func (fr *fakeReplica) Desc() *roachpb.RangeDescriptor { diff --git a/pkg/storage/queue_helpers_testutil.go b/pkg/storage/queue_helpers_testutil.go index 687bebeb68be..addb45b3331d 100644 --- a/pkg/storage/queue_helpers_testutil.go +++ b/pkg/storage/queue_helpers_testutil.go @@ -23,7 +23,7 @@ import ( func (bq *baseQueue) testingAdd( ctx context.Context, repl replicaInQueue, priority float64, ) (bool, error) { - return bq.addInternal(ctx, repl.Desc(), priority) + return bq.addInternal(ctx, repl.Desc(), repl.ReplicaID(), priority) } func forceScanAndProcess(s *Store, q *baseQueue) error { diff --git a/pkg/storage/queue_test.go b/pkg/storage/queue_test.go index df2cd9bf4cf1..51d8dc4e18dd 100644 --- a/pkg/storage/queue_test.go +++ b/pkg/storage/queue_test.go @@ -13,6 +13,7 @@ package storage import ( "container/heap" "context" + "fmt" "strconv" "sync/atomic" "testing" @@ -33,6 +34,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/gogo/protobuf/proto" "github.com/pkg/errors" + "github.com/stretchr/testify/require" ) // testQueueImpl implements queueImpl with a closure for shouldQueue. @@ -1099,6 +1101,42 @@ func TestBaseQueueProcessConcurrently(t *testing.T) { assertProcessedAndProcessing(3, 0) } +// TestBaseQueueReplicaChange ensures that if a replica is added to the queue +// with a non-zero replica ID then it is only popped if the retrieved replica +// from the getReplica() function has the same replica ID. +func TestBaseQueueChangeReplicaID(t *testing.T) { + defer leaktest.AfterTest(t)() + // The testContext exists only to construct the baseQueue. 
+ tc := testContext{} + stopper := stop.NewStopper() + ctx := context.Background() + defer stopper.Stop(ctx) + tc.Start(t, stopper) + testQueue := &testQueueImpl{ + shouldQueueFn: func(now hlc.Timestamp, r *Replica) (shouldQueue bool, priority float64) { + return true, 1.0 + }, + } + bq := makeTestBaseQueue("test", testQueue, tc.store, tc.gossip, queueConfig{ + maxSize: defaultQueueMaxSize, + acceptsUnsplitRanges: true, + }) + r := &fakeReplica{id: 1, replicaID: 1} + bq.mu.Lock() + bq.getReplica = func(rangeID roachpb.RangeID) (replicaInQueue, error) { + if rangeID != 1 { + panic(fmt.Errorf("expected range id 1, got %d", rangeID)) + } + return r, nil + } + bq.mu.Unlock() + bq.maybeAdd(ctx, r, tc.store.Clock().Now()) + require.Equal(t, r, bq.pop()) + bq.maybeAdd(ctx, r, tc.store.Clock().Now()) + r.replicaID = 2 + require.Nil(t, bq.pop()) +} + func TestBaseQueueRequeue(t *testing.T) { defer leaktest.AfterTest(t)() tc := testContext{} diff --git a/pkg/storage/raft.pb.go b/pkg/storage/raft.pb.go index 97b93abcba9a..05c6e6ce9c58 100644 --- a/pkg/storage/raft.pb.go +++ b/pkg/storage/raft.pb.go @@ -70,7 +70,7 @@ func (x *SnapshotRequest_Priority) UnmarshalJSON(data []byte) error { return nil } func (SnapshotRequest_Priority) EnumDescriptor() ([]byte, []int) { - return fileDescriptor_raft_d6a64da2f1251cf6, []int{5, 0} + return fileDescriptor_raft_c419318fe988e310, []int{5, 0} } type SnapshotRequest_Strategy int32 @@ -107,7 +107,7 @@ func (x *SnapshotRequest_Strategy) UnmarshalJSON(data []byte) error { return nil } func (SnapshotRequest_Strategy) EnumDescriptor() ([]byte, []int) { - return fileDescriptor_raft_d6a64da2f1251cf6, []int{5, 1} + return fileDescriptor_raft_c419318fe988e310, []int{5, 1} } type SnapshotRequest_Type int32 @@ -146,7 +146,7 @@ func (x *SnapshotRequest_Type) UnmarshalJSON(data []byte) error { return nil } func (SnapshotRequest_Type) EnumDescriptor() ([]byte, []int) { - return fileDescriptor_raft_d6a64da2f1251cf6, []int{5, 2} + return fileDescriptor_raft_c419318fe988e310, []int{5, 2} } type SnapshotResponse_Status int32 @@ -191,7 +191,7 @@ func (x *SnapshotResponse_Status) UnmarshalJSON(data []byte) error { return nil } func (SnapshotResponse_Status) EnumDescriptor() ([]byte, []int) { - return fileDescriptor_raft_d6a64da2f1251cf6, []int{6, 0} + return fileDescriptor_raft_c419318fe988e310, []int{6, 0} } // RaftHeartbeat is a request that contains the barebones information for a @@ -205,13 +205,22 @@ type RaftHeartbeat struct { Term uint64 `protobuf:"varint,4,opt,name=term" json:"term"` Commit uint64 `protobuf:"varint,5,opt,name=commit" json:"commit"` Quiesce bool `protobuf:"varint,6,opt,name=quiesce" json:"quiesce"` + // ToIsLearner was added in v19.2 to aid in the transition from preemptive + // snapshots to learner replicas. If a Replica learns its ID from a message + // which indicates that it is a learner and it is not currently a part of the + // range (due to being from a preemptive snapshot) then it must delete all of + // its data. + // + // TODO(ajwerner): remove in 20.2 once we ensure that preemptive snapshots can + // no longer be present and that we're never talking to a 19.2 node. 
+ ToIsLearner bool `protobuf:"varint,7,opt,name=to_is_learner,json=toIsLearner" json:"to_is_learner"` } func (m *RaftHeartbeat) Reset() { *m = RaftHeartbeat{} } func (m *RaftHeartbeat) String() string { return proto.CompactTextString(m) } func (*RaftHeartbeat) ProtoMessage() {} func (*RaftHeartbeat) Descriptor() ([]byte, []int) { - return fileDescriptor_raft_d6a64da2f1251cf6, []int{0} + return fileDescriptor_raft_c419318fe988e310, []int{0} } func (m *RaftHeartbeat) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -267,7 +276,7 @@ func (m *RaftMessageRequest) Reset() { *m = RaftMessageRequest{} } func (m *RaftMessageRequest) String() string { return proto.CompactTextString(m) } func (*RaftMessageRequest) ProtoMessage() {} func (*RaftMessageRequest) Descriptor() ([]byte, []int) { - return fileDescriptor_raft_d6a64da2f1251cf6, []int{1} + return fileDescriptor_raft_c419318fe988e310, []int{1} } func (m *RaftMessageRequest) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -300,7 +309,7 @@ func (m *RaftMessageRequestBatch) Reset() { *m = RaftMessageRequestBatch func (m *RaftMessageRequestBatch) String() string { return proto.CompactTextString(m) } func (*RaftMessageRequestBatch) ProtoMessage() {} func (*RaftMessageRequestBatch) Descriptor() ([]byte, []int) { - return fileDescriptor_raft_d6a64da2f1251cf6, []int{2} + return fileDescriptor_raft_c419318fe988e310, []int{2} } func (m *RaftMessageRequestBatch) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -333,7 +342,7 @@ func (m *RaftMessageResponseUnion) Reset() { *m = RaftMessageResponseUni func (m *RaftMessageResponseUnion) String() string { return proto.CompactTextString(m) } func (*RaftMessageResponseUnion) ProtoMessage() {} func (*RaftMessageResponseUnion) Descriptor() ([]byte, []int) { - return fileDescriptor_raft_d6a64da2f1251cf6, []int{3} + return fileDescriptor_raft_c419318fe988e310, []int{3} } func (m *RaftMessageResponseUnion) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -376,7 +385,7 @@ func (m *RaftMessageResponse) Reset() { *m = RaftMessageResponse{} } func (m *RaftMessageResponse) String() string { return proto.CompactTextString(m) } func (*RaftMessageResponse) ProtoMessage() {} func (*RaftMessageResponse) Descriptor() ([]byte, []int) { - return fileDescriptor_raft_d6a64da2f1251cf6, []int{4} + return fileDescriptor_raft_c419318fe988e310, []int{4} } func (m *RaftMessageResponse) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -417,7 +426,7 @@ func (m *SnapshotRequest) Reset() { *m = SnapshotRequest{} } func (m *SnapshotRequest) String() string { return proto.CompactTextString(m) } func (*SnapshotRequest) ProtoMessage() {} func (*SnapshotRequest) Descriptor() ([]byte, []int) { - return fileDescriptor_raft_d6a64da2f1251cf6, []int{5} + return fileDescriptor_raft_c419318fe988e310, []int{5} } func (m *SnapshotRequest) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -479,7 +488,7 @@ func (m *SnapshotRequest_Header) Reset() { *m = SnapshotRequest_Header{} func (m *SnapshotRequest_Header) String() string { return proto.CompactTextString(m) } func (*SnapshotRequest_Header) ProtoMessage() {} func (*SnapshotRequest_Header) Descriptor() ([]byte, []int) { - return fileDescriptor_raft_d6a64da2f1251cf6, []int{5, 0} + return fileDescriptor_raft_c419318fe988e310, []int{5, 0} } func (m *SnapshotRequest_Header) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -513,7 +522,7 @@ func (m *SnapshotResponse) Reset() { *m = SnapshotResponse{} } func (m *SnapshotResponse) String() string { return 
proto.CompactTextString(m) } func (*SnapshotResponse) ProtoMessage() {} func (*SnapshotResponse) Descriptor() ([]byte, []int) { - return fileDescriptor_raft_d6a64da2f1251cf6, []int{6} + return fileDescriptor_raft_c419318fe988e310, []int{6} } func (m *SnapshotResponse) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -550,7 +559,7 @@ func (m *ConfChangeContext) Reset() { *m = ConfChangeContext{} } func (m *ConfChangeContext) String() string { return proto.CompactTextString(m) } func (*ConfChangeContext) ProtoMessage() {} func (*ConfChangeContext) Descriptor() ([]byte, []int) { - return fileDescriptor_raft_d6a64da2f1251cf6, []int{7} + return fileDescriptor_raft_c419318fe988e310, []int{7} } func (m *ConfChangeContext) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -798,6 +807,14 @@ func (m *RaftHeartbeat) MarshalTo(dAtA []byte) (int, error) { dAtA[i] = 0 } i++ + dAtA[i] = 0x38 + i++ + if m.ToIsLearner { + dAtA[i] = 1 + } else { + dAtA[i] = 0 + } + i++ return i, nil } @@ -1173,6 +1190,7 @@ func (m *RaftHeartbeat) Size() (n int) { n += 1 + sovRaft(uint64(m.Term)) n += 1 + sovRaft(uint64(m.Commit)) n += 2 + n += 2 return n } @@ -1496,6 +1514,26 @@ func (m *RaftHeartbeat) Unmarshal(dAtA []byte) error { } } m.Quiesce = bool(v != 0) + case 7: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field ToIsLearner", wireType) + } + var v int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowRaft + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + v |= (int(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + m.ToIsLearner = bool(v != 0) default: iNdEx = preIndex skippy, err := skipRaft(dAtA[iNdEx:]) @@ -2814,85 +2852,86 @@ var ( ErrIntOverflowRaft = fmt.Errorf("proto: integer overflow") ) -func init() { proto.RegisterFile("storage/raft.proto", fileDescriptor_raft_d6a64da2f1251cf6) } - -var fileDescriptor_raft_d6a64da2f1251cf6 = []byte{ - // 1224 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe4, 0x56, 0x5f, 0x6f, 0xdb, 0x54, - 0x14, 0x8f, 0x1b, 0x27, 0x71, 0x4e, 0x92, 0xd6, 0xbb, 0x4c, 0x60, 0x85, 0x91, 0x46, 0x1e, 0x1b, - 0xd9, 0x10, 0xc9, 0xa8, 0x06, 0x0f, 0xbc, 0xe5, 0x8f, 0xbb, 0x66, 0xfd, 0x17, 0xb9, 0x59, 0x11, - 0x48, 0x53, 0xe4, 0x38, 0xb7, 0x89, 0xd5, 0xc4, 0xd7, 0xb3, 0x6f, 0x06, 0xd9, 0xa7, 0xe0, 0x23, - 0xf0, 0xca, 0x27, 0xe0, 0x2b, 0xf4, 0x05, 0x69, 0x8f, 0x93, 0x40, 0x15, 0x74, 0x7c, 0x8a, 0x3d, - 0xa1, 0xfb, 0xc7, 0x69, 0xd6, 0x06, 0xd6, 0x4a, 0x88, 0x17, 0x5e, 0x12, 0xfb, 0x9c, 0xf3, 0xfb, - 0x1d, 0x9f, 0xbf, 0xf7, 0x02, 0x8a, 0x28, 0x09, 0x9d, 0x21, 0xae, 0x85, 0xce, 0x11, 0xad, 0x06, - 0x21, 0xa1, 0x04, 0xdd, 0x70, 0x89, 0x7b, 0x1c, 0x12, 0xc7, 0x1d, 0x55, 0xa5, 0xb6, 0x78, 0x93, - 0xbf, 0x06, 0xfd, 0x1a, 0x0e, 0x43, 0x12, 0x46, 0xc2, 0xb0, 0xf8, 0x7e, 0x2c, 0x9d, 0x60, 0xea, - 0x0c, 0x1c, 0xea, 0x48, 0xf9, 0x47, 0x31, 0xa9, 0xfc, 0x0f, 0xfa, 0xb5, 0x88, 0x3a, 0x14, 0x4b, - 0xf5, 0x87, 0x98, 0xba, 0x03, 0xee, 0x90, 0xff, 0x04, 0xfd, 0x05, 0xe7, 0xc5, 0x9b, 0x43, 0x32, - 0x24, 0xfc, 0xb1, 0xc6, 0x9e, 0x84, 0xd4, 0xfc, 0x39, 0x09, 0x05, 0xdb, 0x39, 0xa2, 0x5b, 0xd8, - 0x09, 0x69, 0x1f, 0x3b, 0x14, 0xf5, 0x41, 0x0b, 0x1d, 0x7f, 0x88, 0x7b, 0xde, 0xc0, 0x50, 0xca, - 0x4a, 0x45, 0x6d, 0x3c, 0x3a, 0x39, 0x5d, 0x4f, 0x9c, 0x9d, 0xae, 0x67, 0x6c, 0x26, 0x6f, 0xb7, - 0xde, 0x9c, 0xae, 0x3f, 0x1c, 0x7a, 0x74, 0x34, 0xed, 0x57, 0x5d, 0x32, 0xa9, 0xcd, 0x83, 0x1a, - 0xf4, 0xcf, 0x9f, 0x6b, 0xc1, 0xf1, 0xb0, 0x26, 0xa3, 0xa8, 0x4a, 0x9c, 
0x9d, 0xe1, 0xc4, 0xed, - 0x01, 0xfa, 0x0e, 0xd6, 0x8e, 0x42, 0x32, 0xe9, 0x85, 0x38, 0x18, 0x7b, 0xae, 0xc3, 0x5c, 0xad, - 0x94, 0x95, 0x4a, 0xa1, 0xb1, 0x2f, 0x5d, 0x15, 0x36, 0x43, 0x32, 0xb1, 0x85, 0x96, 0x3b, 0xfc, - 0xf2, 0x7a, 0x0e, 0x63, 0xa4, 0x5d, 0x38, 0x5a, 0x20, 0x1a, 0xa0, 0x67, 0x50, 0xa0, 0x64, 0xd1, - 0x6d, 0x92, 0xbb, 0xdd, 0x95, 0x6e, 0x73, 0x5d, 0xf2, 0x6f, 0x38, 0xcd, 0x51, 0x72, 0xee, 0xd2, - 0x00, 0x95, 0xe2, 0x70, 0x62, 0xa8, 0x3c, 0x97, 0x2a, 0xf3, 0x64, 0x73, 0x09, 0xba, 0x05, 0x69, - 0x97, 0x4c, 0x26, 0x1e, 0x35, 0x52, 0x0b, 0x3a, 0x29, 0x43, 0x25, 0xc8, 0x3c, 0x9b, 0x7a, 0x38, - 0x72, 0xb1, 0x91, 0x2e, 0x2b, 0x15, 0x4d, 0xaa, 0x63, 0xa1, 0xf9, 0xab, 0x0a, 0x88, 0x55, 0x6e, - 0x17, 0x47, 0x91, 0x33, 0xc4, 0x36, 0x7e, 0x36, 0xc5, 0xd1, 0x7f, 0x53, 0xbe, 0x5d, 0xc8, 0x2f, - 0x96, 0x8f, 0xd7, 0x2e, 0xb7, 0xf1, 0x71, 0xf5, 0xbc, 0xbd, 0x2f, 0xe4, 0xa4, 0x85, 0x23, 0x37, - 0xf4, 0x02, 0x4a, 0x42, 0x19, 0x45, 0x6e, 0xa1, 0x2c, 0xa8, 0x0d, 0x70, 0x5e, 0x14, 0x5e, 0x91, - 0xeb, 0x91, 0x65, 0xe7, 0xe9, 0x46, 0x35, 0xc8, 0x4c, 0x44, 0x3e, 0x78, 0xbe, 0x73, 0x1b, 0x6b, - 0x55, 0x31, 0x09, 0x55, 0x99, 0xa6, 0x38, 0x8b, 0xd2, 0x6a, 0x31, 0xcb, 0xa9, 0x25, 0x59, 0x46, - 0x9b, 0x00, 0xa3, 0x78, 0x34, 0x22, 0x23, 0x5d, 0x4e, 0x56, 0x72, 0x1b, 0xe5, 0xea, 0xa5, 0x39, - 0xae, 0xbe, 0x35, 0x43, 0x92, 0x64, 0x01, 0x89, 0xf6, 0x61, 0x6d, 0xfe, 0xd6, 0x0b, 0x71, 0x14, - 0x44, 0x46, 0xe6, 0x5a, 0x64, 0xab, 0x73, 0xb8, 0xcd, 0xd0, 0xe8, 0x29, 0xac, 0x89, 0x3a, 0x47, - 0xd4, 0x09, 0x69, 0xef, 0x18, 0xcf, 0x0c, 0xad, 0xac, 0x54, 0xf2, 0x8d, 0x2f, 0xde, 0x9c, 0xae, - 0x7f, 0x7e, 0xbd, 0xfa, 0x6e, 0xe3, 0x99, 0x5d, 0xe0, 0x6c, 0x07, 0x8c, 0x6c, 0x1b, 0xcf, 0xcc, - 0x3e, 0x7c, 0x70, 0xb9, 0xb9, 0x1a, 0x0e, 0x75, 0x47, 0xe8, 0x11, 0x68, 0xa1, 0x78, 0x8f, 0x0c, - 0x85, 0xc7, 0x70, 0xe7, 0x6f, 0x62, 0xb8, 0x80, 0x16, 0x81, 0xcc, 0xc1, 0x66, 0x07, 0x8c, 0xb7, - 0xac, 0xa2, 0x80, 0xf8, 0x11, 0x7e, 0xe2, 0x7b, 0xc4, 0x47, 0x55, 0x48, 0xf1, 0x8d, 0xc8, 0x7b, - 0x38, 0xb7, 0x61, 0x2c, 0x69, 0x07, 0x8b, 0xe9, 0x6d, 0x61, 0xf6, 0x95, 0x7a, 0xf2, 0xe3, 0xba, - 0x62, 0xfe, 0xb6, 0x02, 0xef, 0x2d, 0xa1, 0xfc, 0x9f, 0x0f, 0xc5, 0x23, 0x48, 0x4d, 0x59, 0x52, - 0xe5, 0x48, 0x7c, 0xfa, 0xae, 0x6a, 0x2d, 0xd4, 0x41, 0x92, 0x09, 0xbc, 0xf9, 0x67, 0x1a, 0xd6, - 0x0e, 0x7c, 0x27, 0x88, 0x46, 0x84, 0xc6, 0xfb, 0xa6, 0x0e, 0xe9, 0x11, 0x76, 0x06, 0x38, 0xae, - 0xd4, 0xbd, 0x25, 0xec, 0x17, 0x30, 0xd5, 0x2d, 0x0e, 0xb0, 0x25, 0x10, 0xdd, 0x05, 0xed, 0xf8, - 0x79, 0xaf, 0xcf, 0x9a, 0x8b, 0x67, 0x2d, 0xdf, 0xc8, 0xb1, 0xca, 0x6c, 0x1f, 0xf2, 0x7e, 0xb3, - 0x33, 0xc7, 0xcf, 0x45, 0xe3, 0xad, 0x43, 0x6e, 0x4c, 0x86, 0x3d, 0xec, 0xd3, 0xd0, 0xc3, 0x91, - 0x91, 0x2c, 0x27, 0x2b, 0x79, 0x1b, 0xc6, 0x64, 0x68, 0x09, 0x09, 0x2a, 0x42, 0xea, 0xc8, 0xf3, - 0x9d, 0x31, 0x0f, 0x34, 0x1e, 0x65, 0x21, 0x2a, 0xfe, 0xa4, 0x42, 0x5a, 0xf8, 0x45, 0x4f, 0xe1, - 0x26, 0x5b, 0x0a, 0x3d, 0xb9, 0x03, 0x7a, 0xb2, 0x21, 0x65, 0xc5, 0xae, 0xd5, 0xcc, 0x28, 0xbc, - 0xbc, 0x81, 0x6f, 0x03, 0xc8, 0xc9, 0xf4, 0x5e, 0x60, 0x5e, 0xb9, 0x64, 0x5c, 0x13, 0x31, 0x63, - 0xde, 0x0b, 0x8c, 0xee, 0x40, 0xce, 0x75, 0xfc, 0xde, 0x00, 0xbb, 0x63, 0xcf, 0xc7, 0x6f, 0x7d, - 0x30, 0xb8, 0x8e, 0xdf, 0x12, 0x72, 0x64, 0x41, 0x8a, 0x1f, 0xf0, 0x7c, 0x39, 0x2d, 0x4f, 0xee, - 0xfc, 0x2a, 0x10, 0xb7, 0xc2, 0x01, 0x03, 0xc4, 0xc1, 0x73, 0x34, 0xda, 0x05, 0x2d, 0x08, 0x3d, - 0x12, 0x7a, 0x74, 0xc6, 0x0f, 0x93, 0xd5, 0xa5, 0x4d, 0x70, 0xb1, 0x4c, 0x1d, 0x09, 0x89, 0x07, - 0x37, 0xa6, 0x60, 0x74, 0x11, 0x0d, 0x1d, 0x8a, 0x87, 0x33, 0x23, 0x73, 0x65, 0xba, 0x03, 0x09, - 
0x89, 0xe9, 0x62, 0x0a, 0xb4, 0x09, 0xb7, 0xa6, 0xbe, 0xec, 0x74, 0x8a, 0x07, 0x3d, 0x1a, 0x4e, - 0x7d, 0xf1, 0x24, 0x62, 0xd7, 0x16, 0x92, 0x53, 0x5c, 0xb4, 0xec, 0xc6, 0x86, 0x3c, 0x64, 0x54, - 0x07, 0x95, 0xce, 0x02, 0x6c, 0x64, 0xf9, 0x27, 0x7d, 0x72, 0x85, 0x4f, 0xea, 0xce, 0x02, 0x3c, - 0x3f, 0x92, 0x67, 0x01, 0x7e, 0xac, 0x6a, 0x8a, 0xbe, 0x62, 0x3e, 0x04, 0x2d, 0x8e, 0x1d, 0xe5, - 0x20, 0xf3, 0x64, 0x6f, 0x7b, 0x6f, 0xff, 0xeb, 0x3d, 0x3d, 0x81, 0xf2, 0xa0, 0xd9, 0x56, 0x73, - 0xff, 0xd0, 0xb2, 0xbf, 0xd1, 0x15, 0x54, 0x80, 0xac, 0x6d, 0x35, 0xea, 0x3b, 0xf5, 0xbd, 0xa6, - 0xa5, 0xaf, 0x98, 0x06, 0x68, 0x71, 0x88, 0xcc, 0x70, 0xfb, 0xb0, 0xd7, 0xa8, 0x77, 0x9b, 0x5b, - 0x7a, 0xc2, 0xfc, 0x0c, 0x54, 0xe6, 0x09, 0x69, 0xa0, 0xda, 0xf5, 0xcd, 0xae, 0x9e, 0x60, 0xac, - 0x3b, 0x56, 0xdd, 0xde, 0xb3, 0x6c, 0x5d, 0x41, 0xab, 0x00, 0x1d, 0xdb, 0xb2, 0x76, 0x3b, 0xdd, - 0xf6, 0x21, 0x23, 0xfa, 0x45, 0x01, 0xfd, 0xfc, 0x4b, 0xe5, 0x0a, 0xdb, 0x82, 0x34, 0xcb, 0xc6, - 0x34, 0xe2, 0x73, 0xb6, 0xba, 0x71, 0xff, 0x1f, 0xc3, 0x13, 0xa0, 0xea, 0x01, 0x47, 0xc4, 0x17, - 0x0b, 0x81, 0x67, 0x47, 0x5e, 0x7c, 0x46, 0xb2, 0x8e, 0xcf, 0x5e, 0x38, 0x12, 0xcd, 0x36, 0xa4, - 0x05, 0xee, 0x52, 0xec, 0xf5, 0x66, 0xd3, 0xea, 0x74, 0xad, 0x96, 0xae, 0x30, 0x55, 0xbd, 0xd3, - 0xd9, 0x69, 0x5b, 0x2d, 0x7d, 0x05, 0x65, 0x21, 0x65, 0xd9, 0xf6, 0xbe, 0xad, 0x27, 0x99, 0x55, - 0xcb, 0x6a, 0xee, 0xb4, 0xf7, 0xac, 0x96, 0xae, 0x3e, 0x56, 0xb5, 0xa4, 0xae, 0x9a, 0x3d, 0xb8, - 0xd1, 0x24, 0xfe, 0x51, 0x73, 0xc4, 0xba, 0xbf, 0x49, 0x7c, 0x8a, 0xbf, 0xa7, 0xe8, 0x01, 0x00, - 0xbb, 0xe8, 0x38, 0xfe, 0x20, 0x5e, 0xca, 0xd9, 0xc6, 0x0d, 0xb9, 0x94, 0xb3, 0x4d, 0xa1, 0x69, - 0xb7, 0xec, 0xac, 0x34, 0xe2, 0x17, 0xa9, 0x4c, 0xe0, 0xcc, 0xc6, 0xc4, 0x11, 0x97, 0xc5, 0xbc, - 0x1d, 0xbf, 0x6e, 0xbc, 0x52, 0x20, 0xbb, 0x3b, 0x1d, 0x53, 0x8f, 0xcd, 0x29, 0x1a, 0x83, 0xbe, - 0x30, 0xaf, 0x62, 0x75, 0xdc, 0xbf, 0xda, 0x50, 0x33, 0xdb, 0xe2, 0xdd, 0xab, 0xed, 0x47, 0x33, - 0x51, 0x51, 0x1e, 0x28, 0xe8, 0x29, 0xe4, 0x99, 0x32, 0x4e, 0x3d, 0x32, 0xdf, 0xdd, 0x76, 0xc5, - 0xdb, 0x57, 0xa8, 0x9d, 0xa0, 0x6f, 0xdc, 0x3b, 0xf9, 0xa3, 0x94, 0x38, 0x39, 0x2b, 0x29, 0x2f, - 0xcf, 0x4a, 0xca, 0xab, 0xb3, 0x92, 0xf2, 0xfb, 0x59, 0x49, 0xf9, 0xe1, 0x75, 0x29, 0xf1, 0xf2, - 0x75, 0x29, 0xf1, 0xea, 0x75, 0x29, 0xf1, 0x6d, 0x46, 0x32, 0xfc, 0x15, 0x00, 0x00, 0xff, 0xff, - 0x38, 0x31, 0xfb, 0x9d, 0x72, 0x0c, 0x00, 0x00, +func init() { proto.RegisterFile("storage/raft.proto", fileDescriptor_raft_c419318fe988e310) } + +var fileDescriptor_raft_c419318fe988e310 = []byte{ + // 1243 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe4, 0x56, 0x4b, 0x6f, 0xdb, 0x46, + 0x10, 0x16, 0x2d, 0x5a, 0x8f, 0x91, 0x64, 0x33, 0xdb, 0xa0, 0x25, 0xd4, 0x54, 0x16, 0x94, 0x26, + 0x55, 0x52, 0x54, 0x4a, 0x8d, 0xb4, 0x87, 0xde, 0xf4, 0xa0, 0x63, 0xc5, 0x2f, 0x81, 0x56, 0x5c, + 0xb4, 0x40, 0x20, 0x50, 0xd4, 0x5a, 0x22, 0x2c, 0x71, 0x99, 0xe5, 0x2a, 0xad, 0xf2, 0x2b, 0xfa, + 0x13, 0x7a, 0xed, 0x3f, 0xf1, 0xa5, 0x40, 0x8e, 0x01, 0x5a, 0x18, 0x8d, 0xd3, 0x5f, 0x91, 0x53, + 0xb1, 0x0f, 0xca, 0x8c, 0xed, 0x36, 0x36, 0x50, 0xf4, 0xd2, 0x8b, 0x44, 0xce, 0xe3, 0x1b, 0xce, + 0x7c, 0x33, 0xb3, 0x0b, 0x28, 0x64, 0x84, 0x3a, 0x23, 0x5c, 0xa7, 0xce, 0x21, 0xab, 0x05, 0x94, + 0x30, 0x82, 0x6e, 0xb8, 0xc4, 0x3d, 0xa2, 0xc4, 0x71, 0xc7, 0x35, 0xa5, 0x2d, 0xde, 0x14, 0xaf, + 0xc1, 0xa0, 0x8e, 0x29, 0x25, 0x34, 0x94, 0x86, 0xc5, 0x0f, 0x23, 0xe9, 0x14, 0x33, 0x67, 0xe8, + 0x30, 0x47, 0xc9, 0x3f, 0x89, 0x40, 0xd5, 0x7f, 0x30, 0xa8, 0x87, 0xcc, 
0x61, 0x58, 0xa9, 0x3f, + 0xc6, 0xcc, 0x1d, 0x8a, 0x80, 0xe2, 0x27, 0x18, 0xc4, 0x82, 0x17, 0x6f, 0x8e, 0xc8, 0x88, 0x88, + 0xc7, 0x3a, 0x7f, 0x92, 0xd2, 0xca, 0xeb, 0x24, 0x14, 0x6c, 0xe7, 0x90, 0x6d, 0x62, 0x87, 0xb2, + 0x01, 0x76, 0x18, 0x1a, 0x40, 0x86, 0x3a, 0xfe, 0x08, 0xf7, 0xbd, 0xa1, 0xa9, 0x95, 0xb5, 0xaa, + 0xde, 0x7c, 0x74, 0x7c, 0xb2, 0x96, 0x38, 0x3d, 0x59, 0x4b, 0xdb, 0x5c, 0xde, 0x69, 0xbf, 0x3d, + 0x59, 0x7b, 0x38, 0xf2, 0xd8, 0x78, 0x36, 0xa8, 0xb9, 0x64, 0x5a, 0x5f, 0x24, 0x35, 0x1c, 0x9c, + 0x3d, 0xd7, 0x83, 0xa3, 0x51, 0x5d, 0x65, 0x51, 0x53, 0x7e, 0x76, 0x5a, 0x00, 0x77, 0x86, 0xe8, + 0x07, 0x58, 0x3d, 0xa4, 0x64, 0xda, 0xa7, 0x38, 0x98, 0x78, 0xae, 0xc3, 0x43, 0x2d, 0x95, 0xb5, + 0x6a, 0xa1, 0xb9, 0xa7, 0x42, 0x15, 0x36, 0x28, 0x99, 0xda, 0x52, 0x2b, 0x02, 0x7e, 0x7d, 0xbd, + 0x80, 0x91, 0xa7, 0x5d, 0x38, 0x8c, 0x01, 0x0d, 0xd1, 0x33, 0x28, 0x30, 0x12, 0x0f, 0x9b, 0x14, + 0x61, 0x77, 0x54, 0xd8, 0x5c, 0x8f, 0xfc, 0x1b, 0x41, 0x73, 0x8c, 0x9c, 0x85, 0x34, 0x41, 0x67, + 0x98, 0x4e, 0x4d, 0x5d, 0xd4, 0x52, 0xe7, 0x91, 0x6c, 0x21, 0x41, 0xb7, 0x20, 0xe5, 0x92, 0xe9, + 0xd4, 0x63, 0xe6, 0x72, 0x4c, 0xa7, 0x64, 0xa8, 0x04, 0xe9, 0x67, 0x33, 0x0f, 0x87, 0x2e, 0x36, + 0x53, 0x65, 0xad, 0x9a, 0x51, 0xea, 0x48, 0x88, 0xaa, 0x22, 0x15, 0x2f, 0xec, 0x4f, 0xb0, 0x43, + 0x7d, 0x4c, 0xcd, 0x74, 0xcc, 0x2a, 0xc7, 0x48, 0x27, 0xdc, 0x96, 0x8a, 0xca, 0x6f, 0x3a, 0x20, + 0xce, 0xf1, 0x0e, 0x0e, 0x43, 0x67, 0x84, 0x6d, 0xfc, 0x6c, 0x86, 0xc3, 0xff, 0x86, 0xe8, 0x1d, + 0xc8, 0xc7, 0x89, 0x16, 0x2c, 0xe7, 0xd6, 0x3f, 0xad, 0x9d, 0x0d, 0xc2, 0xb9, 0xea, 0xb5, 0x71, + 0xe8, 0x52, 0x2f, 0x60, 0x84, 0x46, 0x99, 0xc4, 0x08, 0x44, 0x1d, 0x80, 0x33, 0xfa, 0x04, 0x77, + 0xd7, 0x03, 0xcb, 0x2e, 0x88, 0x41, 0x75, 0x48, 0x4f, 0x65, 0x3d, 0x04, 0x33, 0xb9, 0xf5, 0xd5, + 0x9a, 0x9c, 0x99, 0x9a, 0x2a, 0x53, 0x54, 0x6f, 0x65, 0x15, 0xe7, 0x63, 0xf9, 0x32, 0x3e, 0x36, + 0x00, 0xc6, 0xd1, 0x10, 0x85, 0x66, 0xaa, 0x9c, 0xac, 0xe6, 0xd6, 0xcb, 0xb5, 0x0b, 0x13, 0x5f, + 0x7b, 0x67, 0xda, 0x14, 0x48, 0xcc, 0x13, 0xed, 0xc1, 0xea, 0xe2, 0xad, 0x4f, 0x71, 0x18, 0x84, + 0x66, 0xfa, 0x5a, 0x60, 0x2b, 0x0b, 0x77, 0x9b, 0x7b, 0xa3, 0xa7, 0xb0, 0x2a, 0x79, 0x0e, 0x99, + 0x43, 0x59, 0xff, 0x08, 0xcf, 0xcd, 0x4c, 0x59, 0xab, 0xe6, 0x9b, 0x5f, 0xbd, 0x3d, 0x59, 0xfb, + 0xf2, 0x7a, 0xfc, 0x6e, 0xe1, 0xb9, 0x5d, 0x10, 0x68, 0xfb, 0x1c, 0x6c, 0x0b, 0xcf, 0x2b, 0x03, + 0xf8, 0xe8, 0x62, 0x73, 0x35, 0x1d, 0xe6, 0x8e, 0xd1, 0x23, 0xc8, 0x50, 0xf9, 0x1e, 0x9a, 0x9a, + 0xc8, 0xe1, 0xce, 0xdf, 0xe4, 0x70, 0xce, 0x5b, 0x26, 0xb2, 0x70, 0xae, 0x74, 0xc1, 0x7c, 0xc7, + 0x2a, 0x0c, 0x88, 0x1f, 0xe2, 0x27, 0xbe, 0x47, 0x7c, 0x54, 0x83, 0x65, 0xb1, 0x3b, 0x45, 0x0f, + 0xe7, 0xd6, 0xcd, 0x4b, 0xda, 0xc1, 0xe2, 0x7a, 0x5b, 0x9a, 0x7d, 0xa3, 0x1f, 0xff, 0xbc, 0xa6, + 0x55, 0x7e, 0x5f, 0x82, 0x0f, 0x2e, 0x81, 0xfc, 0x9f, 0x0f, 0xc5, 0x23, 0x58, 0x9e, 0xf1, 0xa2, + 0xaa, 0x91, 0xf8, 0xfc, 0x7d, 0x6c, 0xc5, 0x78, 0x50, 0x60, 0xd2, 0xbf, 0xf2, 0x67, 0x0a, 0x56, + 0xf7, 0x7d, 0x27, 0x08, 0xc7, 0x84, 0x45, 0xfb, 0xa6, 0x01, 0xa9, 0x31, 0x76, 0x86, 0x38, 0x62, + 0xea, 0xde, 0x25, 0xe8, 0xe7, 0x7c, 0x6a, 0x9b, 0xc2, 0xc1, 0x56, 0x8e, 0xe8, 0x2e, 0x64, 0x8e, + 0x9e, 0xf7, 0x07, 0xbc, 0xb9, 0x44, 0xd5, 0xf2, 0xcd, 0x1c, 0x67, 0x66, 0xeb, 0x40, 0xf4, 0x9b, + 0x9d, 0x3e, 0x7a, 0x2e, 0x1b, 0x6f, 0x0d, 0x72, 0x13, 0x32, 0xea, 0x63, 0x9f, 0x51, 0x0f, 0x87, + 0x66, 0xb2, 0x9c, 0xac, 0xe6, 0x6d, 0x98, 0x90, 0x91, 0x25, 0x25, 0xa8, 0x08, 0xcb, 0x87, 0x9e, + 0xef, 0x4c, 0x44, 0xa2, 0xd1, 0x28, 0x4b, 0x51, 0xf1, 0x17, 0x1d, 0x52, 0x32, 0x2e, 0x7a, 0x0a, + 
0x37, 0xf9, 0x52, 0xe8, 0xab, 0x1d, 0xd0, 0x57, 0x0d, 0xa9, 0x18, 0xbb, 0x56, 0x33, 0x23, 0x7a, + 0x71, 0x03, 0xdf, 0x06, 0x50, 0x93, 0xe9, 0xbd, 0xc0, 0x82, 0xb9, 0x64, 0xc4, 0x89, 0x9c, 0x31, + 0xef, 0x05, 0x46, 0x77, 0x20, 0xe7, 0x3a, 0x7e, 0x7f, 0x88, 0xdd, 0x89, 0xe7, 0xe3, 0x77, 0x3e, + 0x18, 0x5c, 0xc7, 0x6f, 0x4b, 0x39, 0xb2, 0x60, 0x59, 0x5c, 0x05, 0xc4, 0x72, 0xba, 0xbc, 0xb8, + 0x8b, 0x4b, 0x43, 0xd4, 0x0a, 0xfb, 0xdc, 0x21, 0x4a, 0x5e, 0x78, 0xa3, 0x1d, 0xc8, 0x04, 0xd4, + 0x23, 0xd4, 0x63, 0x73, 0x71, 0xec, 0xac, 0x5c, 0xda, 0x04, 0xe7, 0x69, 0xea, 0x2a, 0x97, 0x68, + 0x70, 0x23, 0x08, 0x0e, 0x17, 0x32, 0xea, 0x30, 0x3c, 0x9a, 0x8b, 0xf3, 0xe9, 0x6a, 0x70, 0xfb, + 0xca, 0x25, 0x82, 0x8b, 0x20, 0xd0, 0x06, 0xdc, 0x9a, 0xf9, 0xaa, 0xd3, 0x19, 0x1e, 0xf6, 0x19, + 0x9d, 0xf9, 0xf2, 0x49, 0xe6, 0x9e, 0x89, 0x15, 0xa7, 0x18, 0xb7, 0xec, 0x45, 0x86, 0x22, 0x65, + 0xd4, 0x00, 0x9d, 0xcd, 0x03, 0x6c, 0x66, 0xc5, 0x27, 0x7d, 0x76, 0x85, 0x4f, 0xea, 0xcd, 0x03, + 0xbc, 0x38, 0xbc, 0xe7, 0x01, 0x7e, 0xac, 0x67, 0x34, 0x63, 0xa9, 0xf2, 0x10, 0x32, 0x51, 0xee, + 0x28, 0x07, 0xe9, 0x27, 0xbb, 0x5b, 0xbb, 0x7b, 0xdf, 0xee, 0x1a, 0x09, 0x94, 0x87, 0x8c, 0x6d, + 0xb5, 0xf6, 0x0e, 0x2c, 0xfb, 0x3b, 0x43, 0x43, 0x05, 0xc8, 0xda, 0x56, 0xb3, 0xb1, 0xdd, 0xd8, + 0x6d, 0x59, 0xc6, 0x52, 0xc5, 0x84, 0x4c, 0x94, 0x22, 0x37, 0xdc, 0x3a, 0xe8, 0x37, 0x1b, 0xbd, + 0xd6, 0xa6, 0x91, 0xa8, 0x7c, 0x01, 0x3a, 0x8f, 0x84, 0x32, 0xa0, 0xdb, 0x8d, 0x8d, 0x9e, 0x91, + 0xe0, 0xa8, 0xdb, 0x56, 0xc3, 0xde, 0xb5, 0x6c, 0x43, 0x43, 0x2b, 0x00, 0x5d, 0xdb, 0xb2, 0x76, + 0xba, 0xbd, 0xce, 0x01, 0x07, 0xfa, 0x55, 0x03, 0xe3, 0xec, 0x4b, 0xd5, 0x0a, 0xdb, 0x84, 0x14, + 0xaf, 0xc6, 0x2c, 0x14, 0x73, 0xb6, 0xb2, 0x7e, 0xff, 0x1f, 0xd3, 0x93, 0x4e, 0xb5, 0x7d, 0xe1, + 0x11, 0x5d, 0x41, 0xa4, 0x3f, 0x3f, 0xf2, 0xa2, 0x33, 0x92, 0x77, 0x7c, 0xf6, 0xdc, 0x91, 0x58, + 0xe9, 0x40, 0x4a, 0xfa, 0x5d, 0xc8, 0xbd, 0xd1, 0x6a, 0x59, 0xdd, 0x9e, 0xd5, 0x36, 0x34, 0xae, + 0x6a, 0x74, 0xbb, 0xdb, 0x1d, 0xab, 0x6d, 0x2c, 0xa1, 0x2c, 0x2c, 0x5b, 0xb6, 0xbd, 0x67, 0x1b, + 0x49, 0x6e, 0xd5, 0xb6, 0x5a, 0xdb, 0x9d, 0x5d, 0xab, 0x6d, 0xe8, 0x8f, 0xf5, 0x4c, 0xd2, 0xd0, + 0x2b, 0x7d, 0xb8, 0xd1, 0x22, 0xfe, 0x61, 0x6b, 0xcc, 0xbb, 0xbf, 0x45, 0x7c, 0x86, 0x7f, 0x64, + 0xe8, 0x01, 0x00, 0xbf, 0x12, 0x39, 0xfe, 0x30, 0x5a, 0xca, 0xd9, 0xe6, 0x0d, 0xb5, 0x94, 0xb3, + 0x2d, 0xa9, 0xe9, 0xb4, 0xed, 0xac, 0x32, 0x12, 0x57, 0xae, 0x74, 0xe0, 0xcc, 0x27, 0xc4, 0x91, + 0xd7, 0xca, 0xbc, 0x1d, 0xbd, 0xae, 0xbf, 0xd2, 0x20, 0xbb, 0x33, 0x9b, 0x30, 0x8f, 0xcf, 0x29, + 0x9a, 0x80, 0x11, 0x9b, 0x57, 0xb9, 0x3a, 0xee, 0x5f, 0x6d, 0xa8, 0xb9, 0x6d, 0xf1, 0xee, 0xd5, + 0xf6, 0x63, 0x25, 0x51, 0xd5, 0x1e, 0x68, 0xe8, 0x29, 0xe4, 0xb9, 0x32, 0x2a, 0x3d, 0xaa, 0xbc, + 0xbf, 0xed, 0x8a, 0xb7, 0xaf, 0xc0, 0x9d, 0x84, 0x6f, 0xde, 0x3b, 0x7e, 0x5d, 0x4a, 0x1c, 0x9f, + 0x96, 0xb4, 0x97, 0xa7, 0x25, 0xed, 0xd5, 0x69, 0x49, 0xfb, 0xe3, 0xb4, 0xa4, 0xfd, 0xf4, 0xa6, + 0x94, 0x78, 0xf9, 0xa6, 0x94, 0x78, 0xf5, 0xa6, 0x94, 0xf8, 0x3e, 0xad, 0x10, 0xfe, 0x0a, 0x00, + 0x00, 0xff, 0xff, 0xd1, 0xe2, 0x72, 0x64, 0x9c, 0x0c, 0x00, 0x00, } diff --git a/pkg/storage/raft.proto b/pkg/storage/raft.proto index b008a8cada9f..3cad2bc76021 100644 --- a/pkg/storage/raft.proto +++ b/pkg/storage/raft.proto @@ -35,6 +35,16 @@ message RaftHeartbeat { optional uint64 term = 4 [(gogoproto.nullable) = false]; optional uint64 commit = 5 [(gogoproto.nullable) = false]; optional bool quiesce = 6 [(gogoproto.nullable) = false]; + + // ToIsLearner was added in v19.2 to aid in the transition from 
preemptive + // snapshots to learner replicas. If a Replica learns its ID from a message + // which indicates that it is a learner and it is not currently a part of the + // range (due to being from a preemptive snapshot) then it must delete all of + // its data. + // + // TODO(ajwerner): remove in 20.2 once we ensure that preemptive snapshots can + // no longer be present and that we're never talking to a 19.2 node. + optional bool to_is_learner = 7 [(gogoproto.nullable) = false]; } // RaftMessageRequest is the request used to send raft messages using our diff --git a/pkg/storage/replica.go b/pkg/storage/replica.go index f6eaf43f0116..546cc0124db1 100644 --- a/pkg/storage/replica.go +++ b/pkg/storage/replica.go @@ -242,6 +242,7 @@ type Replica struct { syncutil.RWMutex // The destroyed status of a replica indicating if it's alive, corrupt, // scheduled for destruction or has been GCed. + // destroyStatus should only be set while also holding the raftMu. destroyStatus // Is the range quiescent? Quiescent ranges are not Tick()'d and unquiesce // whenever a Raft operation is performed. @@ -346,6 +347,7 @@ type Replica struct { // The minimum allowed ID for this replica. Initialized from // RaftTombstone.NextReplicaID. minReplicaID roachpb.ReplicaID + // The ID of the leader replica within the Raft group. Used to determine // when the leadership changes. leaderID roachpb.ReplicaID @@ -610,6 +612,14 @@ func (r *Replica) String() string { return fmt.Sprintf("[n%d,s%d,r%s]", r.store.Ident.NodeID, r.store.Ident.StoreID, &r.rangeStr) } +// ReplicaID returns the ID for the Replica. It may be zero if the replica does +// not know its ID. Once a Replica has a non-zero ReplicaID it will never change. +func (r *Replica) ReplicaID() roachpb.ReplicaID { + r.mu.RLock() + defer r.mu.RUnlock() + return r.mu.replicaID +} + // cleanupFailedProposal cleans up after a proposal that has failed. It // clears any references to the proposal and releases associated quota. // It requires that both Replica.mu and Replica.raftMu are exclusively held. diff --git a/pkg/storage/replica_application_result.go b/pkg/storage/replica_application_result.go index aa3a0116a841..0e0b8afcfcb9 100644 --- a/pkg/storage/replica_application_result.go +++ b/pkg/storage/replica_application_result.go @@ -293,22 +293,35 @@ func (r *Replica) handleComputeChecksumResult(ctx context.Context, cc *storagepb r.computeChecksumPostApply(ctx, *cc) } -func (r *Replica) handleChangeReplicasResult(ctx context.Context, chng *storagepb.ChangeReplicas) { - storeID := r.store.StoreID() - var found bool - for _, rDesc := range chng.Replicas() { - if rDesc.StoreID == storeID { - found = true - break - } +func (r *Replica) handleChangeReplicasResult( + ctx context.Context, chng *storagepb.ChangeReplicas, +) (changeRemovedReplica bool) { + // If this command removes us then we need to go through the process of + // removing our replica from the store. After this method returns the code + // should roughly return all the way up to whoever called handleRaftReady + // and this Replica should never be heard from again. We can detect if this + // change removed us by inspecting the replica's destroyStatus. We check the + // destroy status before processing a raft ready so if we find ourselves with + // removal pending at this point then we know that this command must be + // responsible. 
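The control flow described in the comment above is easier to see in isolation. The following standalone sketch uses hypothetical, simplified types (not the real Replica, store, or apply machinery) to model how a removal-pending mark set while a batch is staged later becomes a sentinel error, analogous to apply.ErrRemoved, that stops the apply loop:

```go
package main

import (
	"errors"
	"fmt"
)

// errRemoved plays the role of a sentinel the apply loop checks for.
var errRemoved = errors.New("replica removed")

// replica is a toy stand-in; removalPending models the destroy status mark.
type replica struct {
	removalPending bool
}

// applySideEffects returns errRemoved once the replica has observed its own
// removal, so the caller stops feeding it further commands.
func (r *replica) applySideEffects() error {
	if r.removalPending {
		return errRemoved
	}
	return nil
}

func main() {
	r := &replica{}
	commands := []bool{false, true, false} // the second command removes the replica
	for i, removesUs := range commands {
		if removesUs {
			r.removalPending = true // in the real code this is set while staging the batch
		}
		if err := r.applySideEffects(); errors.Is(err, errRemoved) {
			fmt.Println("stopping the apply loop at command", i)
			return
		}
	}
	fmt.Println("applied all commands")
}
```

The sketch only mirrors the shape of that early return; the real code threads it from ApplySideEffects back up through the apply task.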
+	if ds, _ := r.IsDestroyed(); ds != destroyReasonRemovalPending {
+		return false
+	}
+	if r.store.TestingKnobs().DisableEagerReplicaRemoval {
+		return true
+	}
+	if log.V(1) {
+		log.Infof(ctx, "removing replica due to ChangeReplicasTrigger: %v", chng)
+	}
+	if err := r.postDestroyRaftMuLocked(ctx, r.GetMVCCStats()); err != nil {
+		log.Fatalf(ctx, "failed to run Replica postDestroy: %v", err)
 	}
-	if !found {
-		// This wants to run as late as possible, maximizing the chances
-		// that the other nodes have finished this command as well (since
-		// processing the removal from the queue looks up the Range at the
-		// lease holder, being too early here turns this into a no-op).
-		r.store.replicaGCQueue.AddAsync(ctx, r, replicaGCPriorityRemoved)
+	if err := r.store.removeReplicaImpl(ctx, r, chng.Desc.NextReplicaID, RemoveOptions{
+		DestroyData: false, // We already destroyed the data when the batch committed.
+	}); err != nil {
+		log.Fatalf(ctx, "failed to remove replica: %v", err)
 	}
+	return true
 }
 
 func (r *Replica) handleRaftLogDeltaResult(ctx context.Context, delta int64) {
diff --git a/pkg/storage/replica_application_state_machine.go b/pkg/storage/replica_application_state_machine.go
index ac8432388c09..08b58e82cb04 100644
--- a/pkg/storage/replica_application_state_machine.go
+++ b/pkg/storage/replica_application_state_machine.go
@@ -13,6 +13,7 @@ package storage
 import (
 	"context"
 	"fmt"
+	"math"
 	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
@@ -375,6 +376,9 @@ type replicaAppBatch struct {
 	// triggered a migration to the replica applied state key. If so, this
 	// migration will be performed when the application batch is committed.
 	migrateToAppliedStateKey bool
+	// changeRemovesReplica tracks whether the command in the batch (there must
+	// be only one) removes this replica from the range.
+	changeRemovesReplica bool
 
 	// Statistics.
 	entries int
@@ -435,7 +439,25 @@ func (b *replicaAppBatch) Stage(cmdI apply.Command) (apply.CheckedCommand, error
 		log.Event(ctx, "applying command")
 	}
 
+	// There's a nasty edge case here which only exists in 19.2 (for subtle
+	// reasons).
+	//
+	// Imagine we're catching up from a pre-emptive snapshot and we come across
+	// a merge; we're not guaranteed that the RHS is going to be present. One
+	// option might be to destroy the current replica. Unfortunately we might
+	// also have gotten raft messages and voted, so we'll know our replica ID
+	// but our current view of the range descriptor will not include the
+	// current store.
+	//
+	// * One proposal is to just refuse to apply the current command and spin
+	//   handling raft ready objects without actually applying any commands,
+	//   instead detecting the illegal merge and truncating the CommittedEntries
+	//   slice to before the merge prior to calling Advance.
+	// * The other is to permit destroying an initialized replica with
+	//   nextReplicaID == Replica.mu.replicaID if the store is not in
+	//   Replica.mu.state.Desc. We'd need to make sure to preserve the hard state
+	//   in this case.
+
 	// Acquire the split or merge lock, if necessary. If a split or merge
 	// command was rejected with a below-Raft forced error then its replicated
 	// result was just cleared and this will be a no-op.
 	if splitMergeUnlock, err := b.r.maybeAcquireSplitMergeLock(ctx, cmd.raftCmd); err != nil {
@@ -514,6 +536,20 @@ func (b *replicaAppBatch) stageWriteBatch(ctx context.Context, cmd *replicatedCm
 	return nil
 }
 
+// changeRemovesStore returns true if any of the removals in this change have storeID.
+func changeRemovesStore(
+	desc *roachpb.RangeDescriptor, change *storagepb.ChangeReplicas, storeID roachpb.StoreID,
+) bool {
+	_, existsInDesc := desc.GetReplicaDescriptor(storeID)
+	// NB: if we're catching up from a preemptive snapshot then we won't
+	// exist in the current descriptor.
+	if !existsInDesc {
+		return false
+	}
+	_, existsInChange := change.Desc.GetReplicaDescriptor(storeID)
+	return !existsInChange
+}
+
 // runPreApplyTriggers runs any triggers that must fire before a command is
 // applied. It may modify the command's ReplicatedEvalResult.
 func (b *replicaAppBatch) runPreApplyTriggers(ctx context.Context, cmd *replicatedCmd) error {
@@ -560,16 +596,23 @@ func (b *replicaAppBatch) runPreApplyTriggers(ctx context.Context, cmd *replicat
 	if merge := res.Merge; merge != nil {
 		// Merges require the subsumed range to be atomically deleted when the
 		// merge transaction commits.
+
+		// If our range currently has a non-zero replica ID then we know we're
+		// safe to commit this merge because of the invariants provided to us
+		// by the merge protocol. Namely, we know that if this command committed
+		// then all of the replicas in the range descriptor were collocated when
+		// it committed. If we do not have a non-zero replica ID then the logic
+		// in Stage should detect that and destroy our preemptive snapshot, so we
+		// shouldn't ever get here.
 		rhsRepl, err := b.r.store.GetReplica(merge.RightDesc.RangeID)
 		if err != nil {
 			return wrapWithNonDeterministicFailure(err, "unable to get replica for merge")
 		}
-		const rangeIDLocalOnly = true
-		const mustClearRange = false
+		const mustUseClearRange = false
 		if err := rhsRepl.preDestroyRaftMuLocked(
-			ctx, b.batch, b.batch, merge.RightDesc.NextReplicaID, rangeIDLocalOnly, mustClearRange,
+			ctx, b.batch, b.batch, roachpb.ReplicaID(math.MaxInt32), clearRangeIDLocalOnly, mustUseClearRange,
 		); err != nil {
-			return wrapWithNonDeterministicFailure(err, "unable to destroy range before merge")
+			return wrapWithNonDeterministicFailure(err, "unable to destroy replica before merge")
 		}
 	}
 
@@ -597,6 +640,41 @@ func (b *replicaAppBatch) runPreApplyTriggers(ctx context.Context, cmd *replicat
 		}
 	}
 
+	// Detect if this command will remove us from the range.
+	// If so we stage the removal of all of our range data into this batch.
+	// We'll complete the removal when it commits. Later logic detects the
+	// removal by inspecting the destroy status.
+	if change := res.ChangeReplicas; change != nil &&
+		changeRemovesStore(b.state.Desc, change, b.r.store.StoreID()) {
+		// Delete all of the local data. We're going to delete the hard state too.
+		// In order for this to be safe, the code above must promise that we're
+		// never going to write hard state for a later replica of this range (one
+		// with a different replica ID) in response to a message.
+		// Furthermore we mark the replica as destroyed so that new commands are
+		// not accepted. The replica will be destroyed in handleChangeReplicasResult.
+		// Note that we must be holding the raftMu here because we're in the
+		// midst of application.
+		b.r.mu.Lock()
+		b.r.mu.destroyStatus.Set(
+			roachpb.NewRangeNotFoundError(b.r.RangeID, b.r.store.StoreID()),
+			destroyReasonRemovalPending)
+		b.r.mu.Unlock()
+		b.changeRemovesReplica = true
+		const mustUseRangeDeletionTombstone = true
+		if !b.r.store.TestingKnobs().DisableEagerReplicaRemoval {
+			if err := b.r.preDestroyRaftMuLocked(
+				ctx,
+				b.batch,
+				b.batch,
+				change.Desc.NextReplicaID,
+				clearAll,
+				mustUseRangeDeletionTombstone,
+			); err != nil {
+				return wrapWithNonDeterministicFailure(err, "unable to destroy replica before removal")
+			}
+		}
+	}
+
 	// Provide the command's corresponding logical operations to the Replica's
 	// rangefeed. Only do so if the WriteBatch is non-nil, in which case the
 	// rangefeed requires there to be a corresponding logical operation log or
@@ -609,6 +687,7 @@ func (b *replicaAppBatch) runPreApplyTriggers(ctx context.Context, cmd *replicat
 	} else if cmd.raftCmd.LogicalOpLog != nil {
 		log.Fatalf(ctx, "non-nil logical op log with nil write batch: %v", cmd.raftCmd)
 	}
+
 	return nil
 }
 
@@ -676,7 +755,7 @@ func (b *replicaAppBatch) ApplyToStateMachine(ctx context.Context) error {
 	// the applied state is stored in this batch, ensure that if the batch ends
 	// up not being durably committed then the entries in this batch will be
 	// applied again upon startup.
-	const sync = false
+	sync := b.changeRemovesReplica
 	if err := b.batch.Commit(sync); err != nil {
 		return wrapWithNonDeterministicFailure(err, "unable to commit Raft entry batch")
 	}
@@ -729,6 +809,10 @@ func (b *replicaAppBatch) ApplyToStateMachine(ctx context.Context) error {
 // batch's RocksDB batch. This records the highest raft and lease index that
 // have been applied as of this batch. It also records the Range's mvcc stats.
 func (b *replicaAppBatch) addAppliedStateKeyToBatch(ctx context.Context) error {
+	if b.changeRemovesReplica {
+		log.Infof(ctx, "change removes replica")
+		return nil
+	}
 	loader := &b.r.raftMu.stateLoader
 	if b.migrateToAppliedStateKey {
 		// A Raft command wants us to begin using the RangeAppliedState key
@@ -857,14 +941,18 @@ func (sm *replicaStateMachine) ApplySideEffects(
 	// before notifying a potentially waiting client.
 	clearTrivialReplicatedEvalResultFields(cmd.replicatedResult())
 	if !cmd.IsTrivial() {
-		shouldAssert := sm.handleNonTrivialReplicatedEvalResult(ctx, *cmd.replicatedResult())
+		shouldAssert, isRemoved := sm.handleNonTrivialReplicatedEvalResult(ctx, *cmd.replicatedResult())
+
+		if isRemoved {
+			return nil, apply.ErrRemoved
+		}
 		// NB: Perform state assertion before acknowledging the client.
 		// Some tests (TestRangeStatsInit) assumes that once the store has started
 		// and the first range has a lease that there will not be a later hard-state.
 		if shouldAssert {
+			sm.r.mu.Lock()
 			// Assert that the on-disk state doesn't diverge from the in-memory
 			// state as a result of the side effects.
-			sm.r.mu.Lock()
 			sm.r.assertStateLocked(ctx, sm.r.store.Engine())
 			sm.r.mu.Unlock()
 			sm.stats.stateAssertions++
@@ -923,7 +1011,7 @@ func (sm *replicaStateMachine) ApplySideEffects(
 // to pass a replicatedResult that does not imply any side-effects.
 func (sm *replicaStateMachine) handleNonTrivialReplicatedEvalResult(
 	ctx context.Context, rResult storagepb.ReplicatedEvalResult,
-) (shouldAssert bool) {
+) (shouldAssert, isRemoved bool) {
 	// Assert that this replicatedResult implies at least one side-effect.
 	if rResult.Equal(storagepb.ReplicatedEvalResult{}) {
 		log.Fatalf(ctx, "zero-value ReplicatedEvalResult passed to handleNonTrivialReplicatedEvalResult")
@@ -955,7 +1043,7 @@ func (sm *replicaStateMachine) handleNonTrivialReplicatedEvalResult(
 	// we want to assert that these two states do not diverge.
 	shouldAssert = !rResult.Equal(storagepb.ReplicatedEvalResult{})
 	if !shouldAssert {
-		return false
+		return false, sm.batch.changeRemovesReplica
 	}
 
 	if rResult.Split != nil {
@@ -995,7 +1083,7 @@ func (sm *replicaStateMachine) handleNonTrivialReplicatedEvalResult(
 	}
 
 	if rResult.ChangeReplicas != nil {
-		sm.r.handleChangeReplicasResult(ctx, rResult.ChangeReplicas)
+		isRemoved = sm.r.handleChangeReplicasResult(ctx, rResult.ChangeReplicas)
 		rResult.ChangeReplicas = nil
 	}
 
@@ -1007,7 +1095,7 @@ func (sm *replicaStateMachine) handleNonTrivialReplicatedEvalResult(
 	if !rResult.Equal(storagepb.ReplicatedEvalResult{}) {
 		log.Fatalf(ctx, "unhandled field in ReplicatedEvalResult: %s", pretty.Diff(rResult, storagepb.ReplicatedEvalResult{}))
 	}
-	return true
+	return true, isRemoved
 }
 
 func (sm *replicaStateMachine) maybeApplyConfChange(ctx context.Context, cmd *replicatedCmd) error {
@@ -1023,10 +1111,17 @@ func (sm *replicaStateMachine) maybeApplyConfChange(ctx context.Context, cmd *re
 		// to raft.
 		return nil
 	}
-	return sm.r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) {
+	// If we're removed then we know that must have happened due to this
+	// command and so we're happy to not apply this conf change.
+	isRemoved, err := sm.r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) {
 		raftGroup.ApplyConfChange(cmd.confChange.ConfChangeI)
 		return true, nil
 	})
+	if isRemoved {
+		_, err = sm.r.IsDestroyed()
+		err = wrapWithNonDeterministicFailure(err, "failed to apply config change because replica was already removed")
+	}
+	return err
 	default:
 		panic("unexpected")
 	}
diff --git a/pkg/storage/replica_destroy.go b/pkg/storage/replica_destroy.go
index 0fb60c307472..efd25b4de1b2 100644
--- a/pkg/storage/replica_destroy.go
+++ b/pkg/storage/replica_destroy.go
@@ -12,6 +12,7 @@ package storage
 import (
 	"context"
+	"fmt"
 
 	"github.com/cockroachdb/cockroach/pkg/keys"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
@@ -29,7 +30,10 @@ type DestroyReason int
 const (
 	// The replica is alive.
 	destroyReasonAlive DestroyReason = iota
-	// The replica has been marked for GC, but hasn't been GCed yet.
+	// The replica is in the process of being removed but has not been removed
+	// yet. It exists to avoid races between two threads which may decide to
+	// destroy a replica (e.g. processing a ChangeReplicasTrigger removing the
+	// range and receiving a raft message with a higher replica ID).
 	destroyReasonRemovalPending
 	// The replica has been GCed.
 	destroyReasonRemoved
@@ -43,15 +47,15 @@ type destroyStatus struct {
 	err error
 }
 
+func (s destroyStatus) String() string {
+	return fmt.Sprintf("{%v %d}", s.err, s.reason)
+}
+
 func (s *destroyStatus) Set(err error, reason DestroyReason) {
 	s.err = err
 	s.reason = reason
 }
 
-func (s *destroyStatus) Reset() {
-	s.Set(nil, destroyReasonAlive)
-}
-
 // IsAlive returns true when a replica is alive.
 func (s destroyStatus) IsAlive() bool {
 	return s.reason == destroyReasonAlive
@@ -62,16 +66,22 @@ func (s destroyStatus) Removed() bool {
 	return s.reason == destroyReasonRemoved
 }
 
+// RemovalPending returns whether the replica is removed or in the process of
+// being removed.
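Taken together, the DestroyReason constants above and the RemovalPending predicate below imply a one-way lifecycle: alive, then removal pending, then removed. A minimal standalone model of that lifecycle, using hypothetical names rather than the real destroyStatus type, looks like this:

```go
package main

import "fmt"

type destroyReason int

const (
	reasonAlive destroyReason = iota
	reasonRemovalPending
	reasonRemoved
)

type status struct{ reason destroyReason }

func (s status) IsAlive() bool { return s.reason == reasonAlive }
func (s status) Removed() bool { return s.reason == reasonRemoved }

// RemovalPending stays true from the moment removal is decided until (and
// after) it completes, mirroring the predicate added in the diff.
func (s status) RemovalPending() bool {
	return s.reason == reasonRemovalPending || s.reason == reasonRemoved
}

func main() {
	s := status{reason: reasonAlive}
	fmt.Println(s.IsAlive(), s.RemovalPending(), s.Removed()) // true false false
	s.reason = reasonRemovalPending // set while the removal is being staged
	fmt.Println(s.IsAlive(), s.RemovalPending(), s.Removed()) // false true false
	s.reason = reasonRemoved // set once the replica is fully destroyed
	fmt.Println(s.IsAlive(), s.RemovalPending(), s.Removed()) // false true true
}
```

Treating "removed" as still removal-pending lets callers use a single predicate to refuse new work both while the removal is in flight and after it has completed.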
+func (s destroyStatus) RemovalPending() bool { + return s.reason == destroyReasonRemovalPending || s.reason == destroyReasonRemoved +} + func (r *Replica) preDestroyRaftMuLocked( ctx context.Context, reader engine.Reader, writer engine.Writer, nextReplicaID roachpb.ReplicaID, - rangeIDLocalOnly bool, + clearOpt clearRangeOption, mustClearRange bool, ) error { desc := r.Desc() - err := clearRangeData(desc, reader, writer, rangeIDLocalOnly, mustClearRange) + err := clearRangeData(desc, reader, writer, clearOpt, mustClearRange) if err != nil { return err } @@ -114,6 +124,53 @@ func (r *Replica) postDestroyRaftMuLocked(ctx context.Context, ms enginepb.MVCCS return nil } +// destroyUninitializedReplicaRaftMuLocked is called when we know that an +// uninitialized replica (which knows its replica ID but not its key range) has +// been removed and re-added as a different replica (with a new replica +// ID). We're safe to GC its hard state because nobody cares about our votes +// anymore. We can't GC the range's data because we don't know where it is. +// Fortunately this isn't a problem because the range must not have data +// because a replica must apply a snapshot before having data and if we had +// applied a snapshot then we'd be initialized. This replica may have been +// created in anticipation of a split in which case we'll clear its data when +// the split trigger is applied. +func (r *Replica) destroyUninitializedReplicaRaftMuLocked( + ctx context.Context, nextReplicaID roachpb.ReplicaID, +) error { + batch := r.Engine().NewWriteOnlyBatch() + defer batch.Close() + + // Clear the range ID local data including the hard state. + // We don't know about any user data so we can't clear any user data. + // See the comment on this method for why this is safe. + if err := r.preDestroyRaftMuLocked( + ctx, + r.Engine(), + batch, + nextReplicaID, + clearRangeIDLocalOnly, + false, /* mustClearRange */ + ); err != nil { + return err + } + + // We need to sync here because we are potentially deleting sideloaded + // proposals from the file system next. We could write the tombstone only in + // a synchronous batch first and then delete the data alternatively, but + // then need to handle the case in which there is both the tombstone and + // leftover replica data. + if err := batch.Commit(true); err != nil { + return err + } + + if r.raftMu.sideloaded != nil { + if err := r.raftMu.sideloaded.Clear(ctx); err != nil { + log.Warningf(ctx, "failed to remove sideload storage for %v: %v", r, err) + } + } + return nil +} + // destroyRaftMuLocked deletes data associated with a replica, leaving a // tombstone. func (r *Replica) destroyRaftMuLocked(ctx context.Context, nextReplicaID roachpb.ReplicaID) error { @@ -128,7 +185,7 @@ func (r *Replica) destroyRaftMuLocked(ctx context.Context, nextReplicaID roachpb r.Engine(), batch, nextReplicaID, - false, /* rangeIDLocalOnly */ + clearAll, false, /* mustClearRange */ ); err != nil { return err diff --git a/pkg/storage/replica_gc_queue.go b/pkg/storage/replica_gc_queue.go index 33d64df45dca..27878c876758 100644 --- a/pkg/storage/replica_gc_queue.go +++ b/pkg/storage/replica_gc_queue.go @@ -114,13 +114,13 @@ func newReplicaGCQueue(store *Store, db *client.DB, gossip *gossip.Gossip) *repl // in the past. 
 func (rgcq *replicaGCQueue) shouldQueue(
 	ctx context.Context, now hlc.Timestamp, repl *Replica, _ *config.SystemConfig,
-) (bool, float64) {
+) (shouldQ bool, prio float64) {
+
 	lastCheck, err := repl.GetLastReplicaGCTimestamp(ctx)
 	if err != nil {
 		log.Errorf(ctx, "could not read last replica GC timestamp: %+v", err)
 		return false, 0
 	}
-
 	if _, currentMember := repl.Desc().GetReplicaDescriptor(repl.store.StoreID()); !currentMember {
 		return true, replicaGCPriorityRemoved
 	}
@@ -215,6 +220,11 @@ func (rgcq *replicaGCQueue) process(
 	}
 	replyDesc := rs[0]
 
+	repl.mu.RLock()
+	replicaID := repl.mu.replicaID
+	ticks := repl.mu.ticks
+	repl.mu.RUnlock()
+
 	// Now check whether the replica is meant to still exist.
 	// Maybe it was deleted "under us" by being moved.
 	currentDesc, currentMember := replyDesc.GetReplicaDescriptor(repl.store.StoreID())
@@ -234,11 +239,6 @@ func (rgcq *replicaGCQueue) process(
 		// We are no longer a member of this range, but the range still exists.
 		// Clean up our local data.
 
-		repl.mu.RLock()
-		replicaID := repl.mu.replicaID
-		ticks := repl.mu.ticks
-		repl.mu.RUnlock()
-
 		if replicaID == 0 {
 			// This is a preemptive replica. GC'ing a preemptive replica is a
 			// good idea if and only if the up-replication that it was a part of
@@ -284,13 +284,24 @@ func (rgcq *replicaGCQueue) process(
 		rgcq.metrics.RemoveReplicaCount.Inc(1)
 		log.VEventf(ctx, 1, "destroying local data")
+
+		nextReplicaID := replyDesc.NextReplicaID
+		if currentMember {
+			// If we're a member of the current range descriptor then we must not
+			// put down a tombstone at replyDesc.NextReplicaID as that would
+			// prevent us from getting a snapshot and becoming a part of the range.
+			nextReplicaID = currentDesc.ReplicaID
+		}
 		// Note that this seems racy - we didn't hold any locks between reading
 		// the range descriptor above and deciding to remove the replica - but
 		// we pass in the NextReplicaID to detect situations in which the
 		// replica became "non-gc'able" in the meantime by checking (with raftMu
 		// held throughout) whether the replicaID is still smaller than the
-		// NextReplicaID.
-		if err := repl.store.RemoveReplica(ctx, repl, replyDesc.NextReplicaID, RemoveOptions{
+		// NextReplicaID. Given that non-zero replica IDs never change, this is
+		// only possible if we currently think we're processing a pre-emptive
+		// snapshot but discover in RemoveReplica that this store has since been
+		// added to the range and knows its replica ID.
+		if err := repl.store.RemoveReplica(ctx, repl, nextReplicaID, RemoveOptions{
 			DestroyData: true,
 		}); err != nil {
 			return err
@@ -328,13 +339,6 @@ func (rgcq *replicaGCQueue) process(
 			}
 		}
 
-		// We don't have the last NextReplicaID for the subsumed range, nor can we
-		// obtain it, but that's OK: we can just be conservative and use the maximum
-		// possible replica ID. store.RemoveReplica will write a tombstone using
-		// this maximum possible replica ID, which would normally be problematic, as
-		// it would prevent this store from ever having a new replica of the removed
-		// range. In this case, however, it's copacetic, as subsumed ranges _can't_
-		// have new replicas.
 		const nextReplicaID = math.MaxInt32
 		if err := repl.store.RemoveReplica(ctx, repl, nextReplicaID, RemoveOptions{
 			DestroyData: true,
diff --git a/pkg/storage/replica_init.go b/pkg/storage/replica_init.go
index b28a3a33591b..e8deb1f9181d 100644
--- a/pkg/storage/replica_init.go
+++ b/pkg/storage/replica_init.go
@@ -150,26 +150,25 @@ func (r *Replica) initRaftMuLockedReplicaMuLocked(
 	}
 	r.rangeStr.store(replicaID, r.mu.state.Desc)
 	r.connectionClass.set(rpc.ConnectionClassForKey(desc.StartKey))
-	if err := r.setReplicaIDRaftMuLockedMuLocked(replicaID); err != nil {
-		return err
+	if r.mu.replicaID == 0 {
+		if err := r.setReplicaIDRaftMuLockedMuLocked(ctx, replicaID); err != nil {
+			return err
+		}
+	} else if r.mu.replicaID != replicaID {
+		log.Fatalf(ctx, "attempting to initialize a replica which has ID %d with ID %d",
+			r.mu.replicaID, replicaID)
 	}
-	r.assertStateLocked(ctx, r.store.Engine())
 	return nil
 }
 
-func (r *Replica) setReplicaID(replicaID roachpb.ReplicaID) error {
-	r.raftMu.Lock()
-	defer r.raftMu.Unlock()
-	r.mu.Lock()
-	defer r.mu.Unlock()
-	return r.setReplicaIDRaftMuLockedMuLocked(replicaID)
-}
-
-func (r *Replica) setReplicaIDRaftMuLockedMuLocked(replicaID roachpb.ReplicaID) error {
-	if r.mu.replicaID == replicaID {
+func (r *Replica) setReplicaIDRaftMuLockedMuLocked(
+	ctx context.Context, replicaID roachpb.ReplicaID,
+) error {
+	if r.mu.replicaID != 0 {
+		log.Fatalf(ctx, "cannot set replica ID from anything other than 0, currently %d",
+			r.mu.replicaID)
-		// The common case: the replica ID is unchanged.
-		return nil
 	}
 	if replicaID == 0 {
 		// If the incoming message does not have a new replica ID it is a
@@ -183,17 +182,11 @@ func (r *Replica) setReplicaIDRaftMuLockedMuLocked(replicaID roachpb.ReplicaID)
 	if r.mu.replicaID > replicaID {
 		return errors.Errorf("replicaID cannot move backwards from %d to %d", r.mu.replicaID, replicaID)
 	}
-
-	if r.mu.destroyStatus.reason == destroyReasonRemovalPending {
-		// An earlier incarnation of this replica was removed, but apparently it has been re-added
-		// now, so reset the status.
-		r.mu.destroyStatus.Reset()
+	if r.mu.destroyStatus.RemovalPending() {
+		// This replica has been marked for removal and we're trying to resurrect it.
+		log.Fatalf(ctx, "cannot resurrect replica %d", r.mu.replicaID)
 	}
 
-	// if r.mu.replicaID != 0 {
-	// 	// TODO(bdarnell): clean up previous raftGroup (update peers)
-	// }
-
 	// Initialize or update the sideloaded storage. If the sideloaded storage
 	// already exists (which is iff the previous replicaID was non-zero), then
 	// we have to move the contained files over (this corresponds to the case in
@@ -220,24 +213,13 @@ func (r *Replica) setReplicaIDRaftMuLockedMuLocked(replicaID roachpb.ReplicaID)
 		return errors.Wrap(err, "while initializing sideloaded storage")
 	}
 
-	previousReplicaID := r.mu.replicaID
 	r.mu.replicaID = replicaID
+	r.mu.minReplicaID = replicaID + 1
-	if replicaID >= r.mu.minReplicaID {
-		r.mu.minReplicaID = replicaID + 1
-	}
-	// Reset the raft group to force its recreation on next usage.
-	r.mu.internalRaftGroup = nil
-
-	// If there was a previous replica, repropose its pending commands under
-	// this new incarnation.
-	if previousReplicaID != 0 {
-		if log.V(1) {
-			log.Infof(r.AnnotateCtx(context.TODO()), "changed replica ID from %d to %d",
-				previousReplicaID, replicaID)
-		}
-		// repropose all pending commands under new replicaID.
-		r.refreshProposalsLocked(0, reasonReplicaIDChanged)
+	// Sanity check that we do not already have a raft group as we did not
+	// know our replica ID before this call.
+ if r.mu.internalRaftGroup != nil { + log.Fatalf(ctx, "somehow had an initialized raft group on a zero valued replica") } return nil @@ -270,8 +252,9 @@ func (r *Replica) maybeInitializeRaftGroup(ctx context.Context) { // If this replica hasn't initialized the Raft group, create it and // unquiesce and wake the leader to ensure the replica comes up to date. initialized := r.mu.internalRaftGroup != nil + removed := !r.mu.destroyStatus.IsAlive() r.mu.RUnlock() - if initialized { + if initialized || removed { return } @@ -281,7 +264,9 @@ func (r *Replica) maybeInitializeRaftGroup(ctx context.Context) { r.mu.Lock() defer r.mu.Unlock() - if err := r.withRaftGroupLocked(true, func(raftGroup *raft.RawNode) (bool, error) { + // If we raced on checking the destroyStatus above that's fine as + // the below withRaftGroupLocked will no-op. + if _, err := r.withRaftGroupLocked(true, func(raftGroup *raft.RawNode) (bool, error) { return true, nil }); err != nil { log.VErrEventf(ctx, 1, "unable to initialize raft group: %s", err) diff --git a/pkg/storage/replica_learner_test.go b/pkg/storage/replica_learner_test.go index 6f2a5ff6825a..b8d3564f7a0e 100644 --- a/pkg/storage/replica_learner_test.go +++ b/pkg/storage/replica_learner_test.go @@ -429,13 +429,25 @@ func TestReplicaGCQueueSeesLearnerOrJointConfig(t *testing.T) { require.Contains(t, tracing.FormatRecordedSpans(trace), msg) return tc.LookupRangeOrFatal(t, scratchStartKey) } - desc := checkNoGC() // Make sure it didn't collect the learner. require.NotEmpty(t, desc.Replicas().Learners()) // Now get the range into a joint config. tc.RemoveReplicasOrFatal(t, scratchStartKey, tc.Target(1)) // remove learner + // We need to wait until the LHS range has been GC'd on server 1 before the + // RHS can be successfully added. Wait for the RHS to exist. + testutils.SucceedsSoon(t, func() error { + s := tc.Server(1) + firstStore, err := s.GetStores().(*storage.Stores).GetStore(s.GetFirstStoreID()) + require.NoError(t, err) + repl := firstStore.LookupReplica(roachpb.RKey(scratchStartKey)) + if repl != nil { + return fmt.Errorf("replica has not yet been GC'd") + } + return nil + }) + ltk.withStopAfterJointConfig(func() { desc = tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(1)) require.Len(t, desc.Replicas().Filter(predIncoming), 1, desc) @@ -549,7 +561,7 @@ func TestLearnerAdminChangeReplicasRace(t *testing.T) { // that the descriptor has changed since the AdminChangeReplicas command // started. close(blockSnapshotsCh) - if err := g.Wait(); !testutils.IsError(err, `descriptor changed`) { + if err := g.Wait(); !testutils.IsError(err, `descriptor changed|raft group deleted`) { t.Fatalf(`expected "descriptor changed" error got: %+v`, err) } desc = tc.LookupRangeOrFatal(t, scratchStartKey) diff --git a/pkg/storage/replica_proposal_buf.go b/pkg/storage/replica_proposal_buf.go index c60a48d35816..5235d5b0e5ca 100644 --- a/pkg/storage/replica_proposal_buf.go +++ b/pkg/storage/replica_proposal_buf.go @@ -618,12 +618,16 @@ func (rp *replicaProposer) enqueueUpdateCheck() { func (rp *replicaProposer) withGroupLocked(fn func(*raft.RawNode) error) error { // Pass true for mayCampaignOnWake because we're about to propose a command. - return (*Replica)(rp).withRaftGroupLocked(true, func(raftGroup *raft.RawNode) (bool, error) { + isRemoved, err := (*Replica)(rp).withRaftGroupLocked(true, func(raftGroup *raft.RawNode) (bool, error) { // We're proposing a command here so there is no need to wake the leader // if we were quiesced. 
However, we should make sure we are unquiesced. (*Replica)(rp).unquiesceLocked() return false /* unquiesceLocked */, fn(raftGroup) }) + if isRemoved { + err = rp.mu.destroyStatus.err + } + return err } func (rp *replicaProposer) registerProposalLocked(p *ProposalData) { diff --git a/pkg/storage/replica_raft.go b/pkg/storage/replica_raft.go index 52b577da569c..bbaeb2cfcf8c 100644 --- a/pkg/storage/replica_raft.go +++ b/pkg/storage/replica_raft.go @@ -382,7 +382,7 @@ func (r *Replica) hasPendingProposalsRLocked() bool { // stepRaftGroup calls Step on the replica's RawNode with the provided request's // message. Before doing so, it assures that the replica is unquiesced and ready // to handle the request. -func (r *Replica) stepRaftGroup(req *RaftMessageRequest) error { +func (r *Replica) stepRaftGroup(req *RaftMessageRequest) (isRemoved bool, _ error) { // We're processing an incoming raft message (from a batch that may // include MsgVotes), so don't campaign if we wake up our raft // group. @@ -445,14 +445,12 @@ func (r *Replica) handleRaftReadyRaftMuLocked( var hasReady bool var rd raft.Ready r.mu.Lock() - lastIndex := r.mu.lastIndex // used for append below lastTerm := r.mu.lastTerm raftLogSize := r.mu.raftLogSize leaderID := r.mu.leaderID lastLeaderID := leaderID - - err := r.withRaftGroupLocked(true, func(raftGroup *raft.RawNode) (bool, error) { + isRemoved, err := r.withRaftGroupLocked(true, func(raftGroup *raft.RawNode) (bool, error) { if err := r.mu.proposalBuf.FlushLockedWithRaftGroup(raftGroup); err != nil { return false, err } @@ -466,6 +464,10 @@ func (r *Replica) handleRaftReadyRaftMuLocked( const expl = "while checking raft group for Ready" return stats, expl, errors.Wrap(err, expl) } + // If we've been removed then return. + if isRemoved { + return stats, "", nil + } if !hasReady { // We must update the proposal quota even if we don't have a ready. @@ -723,7 +725,6 @@ func (r *Replica) handleRaftReadyRaftMuLocked( // Might have gone negative if node was recently restarted. raftLogSize = 0 } - } // Update protected state - last index, last term, raft log size, and raft @@ -756,10 +757,18 @@ func (r *Replica) handleRaftReadyRaftMuLocked( applicationStart := timeutil.Now() if len(rd.CommittedEntries) > 0 { - if err := appTask.ApplyCommittedEntries(ctx); err != nil { + err := appTask.ApplyCommittedEntries(ctx) + stats.applyCommittedEntriesStats = sm.moveStats() + switch err { + case nil: + case apply.ErrRemoved: + // We know that our replica has been removed. We also know that we've + // stepped the state machine up to the point where it's been removed. + // TODO(ajwerner): decide if there's more to be done here. + return stats, "", err + default: return stats, err.(*nonDeterministicFailure).safeExpl, err } - stats.applyCommittedEntriesStats = sm.moveStats() // etcd raft occasionally adds a nil entry (our own commands are never // empty). This happens in two situations: When a new leader is elected, and @@ -789,11 +798,12 @@ func (r *Replica) handleRaftReadyRaftMuLocked( r.mu.Unlock() } - // TODO(bdarnell): need to check replica id and not Advance if it - // has changed. Or do we need more locking to guarantee that replica - // ID cannot change during handleRaftReady? + // NB: if we just processed a command which removed this replica from the + // raft group we will early return before this point. This, combined with + // the fact that we'll refuse to process messages intended for a higher + // replica ID ensures that our replica ID could not have changed. 
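The calling convention used throughout these call sites — returning an isRemoved boolean alongside the error and only treating the error as fatal when the replica was not removed — can be sketched in isolation. The types and names below are illustrative assumptions; only the (bool, error) shape comes from the diff itself:

```go
package main

import (
	"errors"
	"fmt"
	"log"
)

// replica is a toy stand-in; removalPending models a removal-pending destroy status.
type replica struct{ removalPending bool }

// withRaftGroup mirrors the convention: if the replica is being removed it
// skips fn and reports isRemoved=true with a nil error.
func (r *replica) withRaftGroup(fn func() error) (isRemoved bool, _ error) {
	if r.removalPending {
		return true, nil
	}
	return false, fn()
}

func main() {
	r := &replica{removalPending: true}
	isRemoved, err := r.withRaftGroup(func() error {
		return errors.New("should not run on a removed replica")
	})
	if !isRemoved && err != nil {
		log.Fatal(err) // analogous to log.Fatal in the callers above
	}
	fmt.Println("removal tolerated, isRemoved =", isRemoved)
}
```

The point of the pattern is that callers which can race with removal degrade to a no-op instead of treating the raced condition as a fatal error.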
const expl = "during advance" - if err := r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) { + isRemoved, err = r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) { raftGroup.Advance(rd) // If the Raft group still has more to process then we immediately @@ -804,9 +814,15 @@ func (r *Replica) handleRaftReadyRaftMuLocked( r.store.enqueueRaftUpdateCheck(r.RangeID) } return true, nil - }); err != nil { + }) + if err != nil { return stats, expl, errors.Wrap(err, expl) } + // If we removed ourselves during this call we would have early returned + // above with the apply.ErrRemoved. + if isRemoved { + return stats, expl, errors.Errorf("replica unexpectedly removed") + } // NB: All early returns other than the one due to not having a ready // which also makes the below call are due to fatal errors. @@ -1032,6 +1048,7 @@ func (r *Replica) maybeCoalesceHeartbeat( Term: msg.Term, Commit: msg.Commit, Quiesce: quiesce, + ToIsLearner: toReplica.GetType() == roachpb.LEARNER, } if log.V(4) { log.Infof(ctx, "coalescing beat: %+v", beat) @@ -1153,11 +1170,11 @@ func (r *Replica) sendRaftMessage(ctx context.Context, msg raftpb.Message) { Message: msg, RangeStartKey: startKey, // usually nil }) { - if err := r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) { + if isRemoved, err := r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) { r.mu.droppedMessages++ raftGroup.ReportUnreachable(msg.To) return true, nil - }); err != nil { + }); !isRemoved && err != nil { log.Fatal(ctx, err) } } @@ -1197,10 +1214,10 @@ func (r *Replica) reportSnapshotStatus(ctx context.Context, to roachpb.ReplicaID snapStatus = raft.SnapshotFailure } - if err := r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) { + if isRemoved, err := r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) { raftGroup.ReportSnapshot(uint64(to), snapStatus) return true, nil - }); err != nil { + }); !isRemoved && err != nil { log.Fatal(ctx, err) } } @@ -1322,19 +1339,22 @@ func (s pendingCmdSlice) Less(i, j int) bool { // varies. // // Requires that Replica.mu is held. +// +// If this Replica is in the process of being removed this method will return +// true and a nil error. func (r *Replica) withRaftGroupLocked( mayCampaignOnWake bool, f func(r *raft.RawNode) (unquiesceAndWakeLeader bool, _ error), -) error { - if r.mu.destroyStatus.Removed() { +) (isRemoved bool, _ error) { + if r.mu.destroyStatus.RemovalPending() { // Silently ignore all operations on destroyed replicas. We can't return an // error here as all errors returned from this method are considered fatal. - return nil + return true, nil } if r.mu.replicaID == 0 { // The replica's raft group has not yet been configured (i.e. the replica // was created from a preemptive snapshot). - return nil + return false, nil } if r.mu.internalRaftGroup == nil { @@ -1347,7 +1367,7 @@ func (r *Replica) withRaftGroupLocked( &raftLogger{ctx: ctx}, )) if err != nil { - return err + return false, err } r.mu.internalRaftGroup = raftGroup @@ -1364,7 +1384,7 @@ func (r *Replica) withRaftGroupLocked( if unquiesce { r.unquiesceAndWakeLeaderLocked() } - return err + return false, err } // withRaftGroup calls the supplied function with the (lazily initialized) @@ -1378,9 +1398,12 @@ func (r *Replica) withRaftGroupLocked( // should not initiate an election while handling incoming raft // messages (which may include MsgVotes from an election in progress, // and this election would be disrupted if we started our own). 
+// +// If this Replica is in the process of being removed this method will return +// true and a nil error. func (r *Replica) withRaftGroup( mayCampaignOnWake bool, f func(r *raft.RawNode) (unquiesceAndWakeLeader bool, _ error), -) error { +) (isRemoved bool, _ error) { r.mu.Lock() defer r.mu.Unlock() return r.withRaftGroupLocked(mayCampaignOnWake, f) @@ -1550,7 +1573,8 @@ func (r *Replica) maybeAcquireSplitMergeLock( func (r *Replica) acquireSplitLock( ctx context.Context, split *roachpb.SplitTrigger, ) (func(), error) { - rightRng, _, err := r.store.getOrCreateReplica(ctx, split.RightDesc.RangeID, 0, nil) + rightRng, _, err := r.store.getOrCreateReplica(ctx, split.RightDesc.RangeID, + 0 /* replicaID */, nil /* creatingReplica */, false /* isLearner */) if err != nil { return nil, err } @@ -1572,7 +1596,14 @@ func (r *Replica) acquireMergeLock( // right-hand replica in place, any snapshots for the right-hand range will // block on raftMu, waiting for the merge to complete, after which the replica // will realize it has been destroyed and reject the snapshot. - rightRepl, _, err := r.store.getOrCreateReplica(ctx, merge.RightDesc.RangeID, 0, nil) + // + // There's one edge case to this requirement: the current Replica may be + // catching up from a pre-emptive snapshot. This case only arises in v19.2 + // while performing the upgrade. If we detect that we're not a part of this + // range then it's not safe to apply this merge. In that case we need to + // remove the LHS replica synchronously. + rightRepl, _, err := r.store.getOrCreateReplica(ctx, merge.RightDesc.RangeID, + 0 /* replicaID */, nil /* creatingReplica */, false /* isLearner */) if err != nil { return nil, err } diff --git a/pkg/storage/replica_raftstorage.go b/pkg/storage/replica_raftstorage.go index 7132eae707e0..aa0e62b93de2 100644 --- a/pkg/storage/replica_raftstorage.go +++ b/pkg/storage/replica_raftstorage.go @@ -678,6 +678,14 @@ func (r *Replica) updateRangeInfo(desc *roachpb.RangeDescriptor) error { return nil } +type clearRangeOption int + +const ( + clearRangeIDLocalOnly clearRangeOption = iota + clearReplicatedOnly + clearAll +) + // clearRangeData clears the data associated with a range descriptor. If // rangeIDLocalOnly is true, then only the range-id local keys are deleted. 
// Otherwise, the range-id local keys, range local keys, and user keys are all @@ -689,16 +697,20 @@ func clearRangeData( desc *roachpb.RangeDescriptor, eng engine.Reader, writer engine.Writer, - rangeIDLocalOnly bool, + opt clearRangeOption, mustClearRange bool, ) error { var keyRanges []rditer.KeyRange - if rangeIDLocalOnly { - keyRanges = []rditer.KeyRange{rditer.MakeRangeIDLocalKeyRange(desc.RangeID, false)} - } else { + switch opt { + case clearRangeIDLocalOnly: + keyRanges = []rditer.KeyRange{ + rditer.MakeRangeIDLocalKeyRange(desc.RangeID, false), + } + case clearReplicatedOnly: + keyRanges = rditer.MakeReplicatedKeyRanges(desc) + case clearAll: keyRanges = rditer.MakeAllKeyRanges(desc) } - var clearRangeFn func(engine.Reader, engine.Writer, engine.MVCCKey, engine.MVCCKey) error if mustClearRange { clearRangeFn = func(eng engine.Reader, writer engine.Writer, start, end engine.MVCCKey) error { @@ -1013,7 +1025,7 @@ func (r *Replica) clearSubsumedReplicaDiskData( r.store.Engine(), &subsumedReplSST, subsumedNextReplicaID, - true, /* rangeIDLocalOnly */ + clearRangeIDLocalOnly, true, /* mustClearRange */ ); err != nil { subsumedReplSST.Close() diff --git a/pkg/storage/replica_test.go b/pkg/storage/replica_test.go index 05490d143468..bcbf3a983dea 100644 --- a/pkg/storage/replica_test.go +++ b/pkg/storage/replica_test.go @@ -7278,117 +7278,6 @@ func TestSyncSnapshot(t *testing.T) { } } -// TestReplicaIDChangePending verifies that on a replica ID change, pending -// commands are re-proposed on the new raft group. -func TestReplicaIDChangePending(t *testing.T) { - defer leaktest.AfterTest(t)() - - tc := testContext{} - cfg := TestStoreConfig(nil) - // Disable ticks to avoid automatic reproposals after a timeout, which - // would pass this test. - cfg.RaftTickInterval = math.MaxInt32 - stopper := stop.NewStopper() - defer stopper.Stop(context.TODO()) - tc.StartWithStoreConfig(t, stopper, cfg) - repl := tc.repl - - // Stop the command from being proposed to the raft group and being removed. - proposedOnOld := make(chan struct{}, 1) - repl.mu.Lock() - repl.mu.proposalBuf.testing.submitProposalFilter = func(*ProposalData) (drop bool, _ error) { - select { - case proposedOnOld <- struct{}{}: - default: - } - return true, nil - } - lease := *repl.mu.state.Lease - repl.mu.Unlock() - - // Add a command to the pending list and wait for it to be proposed. - magicTS := tc.Clock().Now() - ba := roachpb.BatchRequest{} - ba.Timestamp = magicTS - ba.Add(&roachpb.PutRequest{ - RequestHeader: roachpb.RequestHeader{ - Key: roachpb.Key("a"), - }, - Value: roachpb.MakeValueFromBytes([]byte("val")), - }) - _, _, _, err := repl.evalAndPropose(context.Background(), lease, &ba, &allSpans, endCmds{}) - if err != nil { - t.Fatal(err) - } - <-proposedOnOld - - // Set the raft command handler so we can tell if the command has been - // re-proposed. - proposedOnNew := make(chan struct{}, 1) - repl.mu.Lock() - repl.mu.proposalBuf.testing.submitProposalFilter = func(p *ProposalData) (drop bool, _ error) { - if p.Request.Timestamp == magicTS { - select { - case proposedOnNew <- struct{}{}: - default: - } - } - return false, nil - } - repl.mu.Unlock() - - // Set the ReplicaID on the replica. 
- if err := repl.setReplicaID(2); err != nil { - t.Fatal(err) - } - - <-proposedOnNew -} - -func TestSetReplicaID(t *testing.T) { - defer leaktest.AfterTest(t)() - - tsc := TestStoreConfig(nil) - tc := testContext{} - stopper := stop.NewStopper() - defer stopper.Stop(context.TODO()) - tc.StartWithStoreConfig(t, stopper, tsc) - - repl := tc.repl - - testCases := []struct { - replicaID roachpb.ReplicaID - minReplicaID roachpb.ReplicaID - newReplicaID roachpb.ReplicaID - expectedMinReplicaID roachpb.ReplicaID - expectedErr string - }{ - {0, 0, 1, 2, ""}, - {0, 1, 1, 2, ""}, - {0, 2, 1, 2, "raft group deleted"}, - {1, 2, 1, 2, ""}, // not an error; replicaID == newReplicaID is checked first - {2, 0, 1, 0, "replicaID cannot move backwards"}, - } - for _, c := range testCases { - t.Run("", func(t *testing.T) { - repl.mu.Lock() - repl.mu.replicaID = c.replicaID - repl.mu.minReplicaID = c.minReplicaID - repl.mu.Unlock() - - err := repl.setReplicaID(c.newReplicaID) - repl.mu.Lock() - if repl.mu.minReplicaID != c.expectedMinReplicaID { - t.Errorf("expected minReplicaID=%d, but found %d", c.expectedMinReplicaID, repl.mu.minReplicaID) - } - repl.mu.Unlock() - if !testutils.IsError(err, c.expectedErr) { - t.Fatalf("expected %q, but found %v", c.expectedErr, err) - } - }) - } -} - func TestReplicaRetryRaftProposal(t *testing.T) { defer leaktest.AfterTest(t)() diff --git a/pkg/storage/split_delay_helper.go b/pkg/storage/split_delay_helper.go index 7f864f1f77b4..f91b4057e8ef 100644 --- a/pkg/storage/split_delay_helper.go +++ b/pkg/storage/split_delay_helper.go @@ -46,7 +46,7 @@ func (sdh *splitDelayHelper) RaftStatus(ctx context.Context) (roachpb.RangeID, * func (sdh *splitDelayHelper) ProposeEmptyCommand(ctx context.Context) { r := (*Replica)(sdh) r.raftMu.Lock() - _ = r.withRaftGroup(true /* campaignOnWake */, func(rawNode *raft.RawNode) (bool, error) { + _, _ = r.withRaftGroup(true /* campaignOnWake */, func(rawNode *raft.RawNode) (bool, error) { // NB: intentionally ignore the error (which can be ErrProposalDropped // when there's an SST inflight). data := encodeRaftCommand(raftVersionStandard, makeIDKey(), nil) diff --git a/pkg/storage/store.go b/pkg/storage/store.go index 99fa19230a5a..0f0b4bf36103 100644 --- a/pkg/storage/store.go +++ b/pkg/storage/store.go @@ -35,6 +35,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/settings" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/sql/sqlutil" + "github.com/cockroachdb/cockroach/pkg/storage/apply" "github.com/cockroachdb/cockroach/pkg/storage/batcheval" "github.com/cockroachdb/cockroach/pkg/storage/closedts/container" "github.com/cockroachdb/cockroach/pkg/storage/closedts/ctpb" @@ -1142,7 +1143,7 @@ func (s *Store) IsStarted() bool { return atomic.LoadInt32(&s.started) == 1 } -// IterateIDPrefixKeys helps visit system keys that use RangeID prefixing ( such as +// IterateIDPrefixKeys helps visit system keys that use RangeID prefixing (such as // RaftHardStateKey, RaftTombstoneKey, and many others). Such keys could in principle exist at any // RangeID, and this helper efficiently discovers all the keys of the desired type (as specified by // the supplied `keyFn`) and, for each key-value pair discovered, unmarshals it into `msg` and then @@ -2117,38 +2118,20 @@ func splitPostApply( rightRng.mu.Lock() // The right hand side of the split may have been removed and re-added // in the meantime, and the replicaID in RightDesc may be stale. - // Consequently the call below may fail with a RaftGroupDeletedError. 
In - // general, this protects earlier incarnations of the replica that were + // In general, this protects earlier incarnations of the replica that were // since replicaGC'ed from reneging on promises made earlier (for // example, once the HardState is removed, a replica could cast a // different vote for the same term). // - // It is safe to circumvent that in this case because the RHS must have - // remained uninitialized (the LHS blocks all user data, and since our - // Raft logs start at a nonzero index a snapshot must go through before - // any log entries are appended). This means the data in that range is - // just a HardState which we "logically" snapshot by assigning it data - // formerly located within the LHS. + // The RHS must have remained uninitialized (the LHS blocks all user data, + // and since our Raft logs start at a nonzero index a snapshot must go + // through before any log entries are appended). This means the data in + // that range is just a HardState which we "logically" snapshot by + // assigning it data formerly located within the LHS. // - // Note that if we ever have a way to replicaGC uninitialized replicas, - // the RHS may have been gc'ed and so the HardState would be gone. In - // that case, the requirement that the HardState remains would have been - // violated if the bypass below were used, which is why we place an - // assertion. - // - // See: - // https://github.com/cockroachdb/cockroach/issues/21146#issuecomment-365757329 - // - // TODO(tbg): this argument is flawed - it's possible for a tombstone - // to exist on the RHS: - // https://github.com/cockroachdb/cockroach/issues/40470 - // Morally speaking, this means that we should throw away the data we - // moved from the LHS to the RHS (depending on the tombstone). - // Realistically speaking it will probably be easier to create the RHS - // anyway, even though there's a tombstone and it may just get gc'ed - // again. Note that for extra flavor, we may not even know whether the - // RHS is currently supposed to exist or not, lending more weight to the - // second approach. + // We detect this case and pass the old range descriptor for the RHS + // to SpitRange below which will clear the RHS data rather than installing + // the RHS in the store. tombstoneKey := keys.RaftTombstoneKey(rightRng.RangeID) var tombstone roachpb.RaftTombstone if ok, err := engine.MVCCGetProto( @@ -2156,49 +2139,56 @@ func splitPostApply( ); err != nil { log.Fatalf(ctx, "unable to load tombstone for RHS: %+v", err) } else if ok { - log.Fatalf(ctx, "split trigger found right-hand side with tombstone %+v: %v", tombstone, rightRng) + log.Warningf(ctx, "split trigger found right-hand side with tombstone %+v, "+ + "RHS must have been removed at this ID: %v", tombstone, rightRng) } rightDesc, ok := split.RightDesc.GetReplicaDescriptor(r.StoreID()) if !ok { - // This is yet another special quirky case. The local store is not - // necessarily a member of the split; this can occur if this store - // wasn't a member at the time of the split, but is nevertheless - // catching up across the split. For example, add a learner, and - // while it is being caught up via a snapshot, remove the learner - // again, then execute a split, and re-add it. Upon being re-added - // the learner will likely catch up from where the snapshot left it, - // and it will see itself get removed, then we hit this branch when - // the split trigger is applied, and eventually there's a - // ChangeReplicas that re-adds the local store under a new - // replicaID. 
-		//
-		// So our trigger will have a smaller replicaID for our RHS, which
-		// will blow up in initRaftMuLockedReplicaMuLocked. We still want
-		// to force the RHS to accept the descriptor, even though that
-		// rewinds the replicaID. To do that we want to change the existing
-		// replicaID, but we didn't find one -- zero is then the only reasonable
-		// choice. Note that this is also the replicaID a replica that is
-		// not reflected in its own descriptor will have, i.e. we're cooking
-		// up less of a special case here than you'd expect at first glance.
-		//
-		// Note that futzing with the replicaID is a high-risk operation as
-		// it is what the raft peer will believe itself to be identified by.
-		// Under no circumstances must we use a replicaID that belongs to
-		// someone else, or a byzantine situation will result. Zero is
-		// special-cased and will never init a raft group until the real
-		// ID is known from inbound raft traffic.
-		rightDesc.ReplicaID = 0 // for clarity only; it's already zero
+		// The only time that this case can happen is if we are currently not
+		// a part of this range (i.e. catching up from a preemptive snapshot).
+		// In this case it's fine and the logic below is reasonable.
+		_, lhsExists := r.mu.state.Desc.GetReplicaDescriptor(r.StoreID())
+		if lhsExists {
+			log.Fatalf(ctx, "we can't have had the local store change replica ID: %d %v", r.mu.replicaID, rightRng)
+		}
 	}
+	// If we're in this case then we know that the RHS has since been removed
+	// and re-added with a higher replica ID. We know we've never processed a
+	// snapshot for the right range because up to this point it would overlap
+	// with the left and ranges cannot move rightwards. Furthermore, if it
+	// didn't overlap because the LHS used to be smaller (it still hasn't
+	// processed a merge), then there's no way we could possibly be processing
+	// this split.
+	//
+	// We might however have already voted at a higher term. In general
+	// this shouldn't happen because we add learners and then promote them
+	// only after we snapshot, but for the sake of mixed-version clusters and
+	// other potential behavior changes we pass the desc for the known-to-be-old
+	// RHS and have SplitRange clear its data. This call to SplitRange
+	// short-circuits the rest of this function, which would set up the RHS.
 	if rightRng.mu.replicaID > rightDesc.ReplicaID {
-		rightRng.mu.replicaID = rightDesc.ReplicaID
+		rightRng.mu.Unlock()
+		err := r.store.SplitRange(ctx, r, rightRng, split.LeftDesc, &split.RightDesc)
+		if err != nil {
+			log.Fatal(ctx, err)
+		}
+		return
+	}
+	// If we are uninitialized and have a zero-value replica ID then we're safe
+	// to ignore this tombstone because we couldn't have ever promised anything
+	// as the newer replica. This is true because we would have needed a
+	// snapshot we couldn't possibly have received. If the tombstone does not
+	// exceed the rightDesc.ReplicaID then we have no need to reset
+	// minReplicaID. This case can occur if we created the RHS due to raft
+	// messages before the LHS acquired the split lock and then the RHS
+	// discovered that it was removed and re-added at a higher replica ID, which
+	// led to a tombstone being laid down, but then the node forgot about the
+	// higher replica ID.
+	if tombstone.NextReplicaID > rightDesc.ReplicaID {
+		rightRng.mu.minReplicaID = 0
 	}
-	// NB: the safety argument above implies that we don't have to worry
-	// about restoring the existing minReplicaID if it's nonzero.
No - // promises have been made, so none need to be kept. So we clear this - // unconditionally, making sure that it doesn't block us from init'ing - // the RHS. - rightRng.mu.minReplicaID = 0 err := rightRng.initRaftMuLockedReplicaMuLocked(&split.RightDesc, r.store.Clock(), 0) + rightRng.mu.Unlock() if err != nil { log.Fatal(ctx, err) @@ -2237,10 +2227,17 @@ func splitPostApply( // until it receives a Raft message addressed to the right-hand range. But // since new replicas start out quiesced, unless we explicitly awaken the // Raft group, there might not be any Raft traffic for quite a while. - if err := rightRng.withRaftGroup(true, func(r *raft.RawNode) (unquiesceAndWakeLeader bool, _ error) { + rightRngIsRemoved, err := rightRng.withRaftGroup(true, func(r *raft.RawNode) (unquiesceAndWakeLeader bool, _ error) { return true, nil - }); err != nil { + }) + if err != nil { log.Fatalf(ctx, "unable to create raft group for right-hand range in split: %+v", err) + } else if rightRngIsRemoved { + // This case should not be possible because the destroyStatus for a replica + // can only change while holding the raftMu and in order to have reached + // this point we know that we must be holding the raftMu for the RHS + // because we acquired it beneath raft. + log.Fatalf(ctx, "unable to create raft group for right-hand range in split: range is removed") } // Invoke the leasePostApply method to ensure we properly initialize @@ -2251,7 +2248,7 @@ func splitPostApply( // Add the RHS replica to the store. This step atomically updates // the EndKey of the LHS replica and also adds the RHS replica // to the store's replica map. - if err := r.store.SplitRange(ctx, r, rightRng, split.LeftDesc); err != nil { + if err := r.store.SplitRange(ctx, r, rightRng, split.LeftDesc, nil /* oldRightDesc */); err != nil { // Our in-memory state has diverged from the on-disk state. log.Fatalf(ctx, "%s: failed to update Store after split: %+v", r, err) } @@ -2289,11 +2286,23 @@ func splitPostApply( // // This is only called from the split trigger in the context of the execution // of a Raft command. +// +// The funky case is if rightDesc is non-nil. If so then we know that the right +// replica has been removed and re-added before applying this split and we need +// to leave it uninitialized. It's not possible for it to have become +// initialized as it would have needed a snapshot which must have overlapped the +// leftRepl. The user data in oldRightDesc will be cleared. func (s *Store) SplitRange( - ctx context.Context, leftRepl, rightRepl *Replica, newLeftDesc roachpb.RangeDescriptor, + ctx context.Context, + leftRepl, rightRepl *Replica, + newLeftDesc roachpb.RangeDescriptor, + oldRightDesc *roachpb.RangeDescriptor, ) error { oldLeftDesc := leftRepl.Desc() - rightDesc := rightRepl.Desc() + rightDesc := oldRightDesc + if rightDesc == nil { + rightDesc = rightRepl.Desc() + } if !bytes.Equal(oldLeftDesc.EndKey, rightDesc.EndKey) || bytes.Compare(oldLeftDesc.StartKey, rightDesc.StartKey) >= 0 { @@ -2308,11 +2317,18 @@ func (s *Store) SplitRange( if exRng != rightRepl { log.Fatalf(ctx, "found unexpected uninitialized replica: %s vs %s", exRng, rightRepl) } - // NB: We only remove from uninitReplicas and the replicaQueues maps here - // so that we don't leave open a window where a replica is temporarily not - // present in Store.mu.replicas. 
-		delete(s.mu.uninitReplicas, rightDesc.RangeID)
-		s.replicaQueues.Delete(int64(rightDesc.RangeID))
+		// If oldRightDesc is not nil then we know that exRng refers to a later
+		// replica in the RHS range and should remain uninitialized.
+		if oldRightDesc == nil {
+			// NB: We only remove from uninitReplicas and the replicaQueues maps here
+			// so that we don't leave open a window where a replica is temporarily not
+			// present in Store.mu.replicas.
+			delete(s.mu.uninitReplicas, rightDesc.RangeID)
+			s.replicaQueues.Delete(int64(rightDesc.RangeID))
+		}
+	} else if oldRightDesc != nil {
+		log.Fatalf(ctx, "found initialized replica despite knowing that the post "+
+			"split replica has been removed")
 	}

 	leftRepl.setDesc(ctx, &newLeftDesc)
@@ -2336,22 +2352,40 @@
 	// Clear the original range's request stats, since they include requests for
 	// spans that are now owned by the new range.
 	leftRepl.leaseholderStats.resetRequestCounts()
-	leftRepl.writeStats.splitRequestCounts(rightRepl.writeStats)
+	if oldRightDesc == nil {
+		leftRepl.writeStats.splitRequestCounts(rightRepl.writeStats)

-	if err := s.addReplicaInternalLocked(rightRepl); err != nil {
-		return errors.Errorf("unable to add replica %v: %s", rightRepl, err)
-	}
+		if err := s.addReplicaInternalLocked(rightRepl); err != nil {
+			return errors.Errorf("unable to add replica %v: %s", rightRepl, err)
+		}

-	// Update the replica's cached byte thresholds. This is a no-op if the system
-	// config is not available, in which case we rely on the next gossip update
-	// to perform the update.
-	if err := rightRepl.updateRangeInfo(rightRepl.Desc()); err != nil {
-		return err
+		// Update the replica's cached byte thresholds. This is a no-op if the system
+		// config is not available, in which case we rely on the next gossip update
+		// to perform the update.
+		if err := rightRepl.updateRangeInfo(rightRepl.Desc()); err != nil {
+			return err
+		}
+		// Add the range to metrics and maybe gossip on capacity change.
+		s.metrics.ReplicaCount.Inc(1)
+		s.maybeGossipOnCapacityChange(ctx, rangeAddEvent)
+	} else {
+		if rightRepl.IsInitialized() {
+			log.Fatalf(ctx, "refusing to clear replicated data for an initialized range %v in SplitRange", rightRepl)
+		}
+		// We need to clear the data which the RHS would have inherited.
+		// The uninitialized RHS doesn't know about this data so we must clear it
+		// lest it be leaked potentially forever.
+		batch := rightRepl.Engine().NewWriteOnlyBatch()
+		defer batch.Close()
+		if err := clearRangeData(oldRightDesc, rightRepl.Engine(), batch, clearReplicatedOnly, false); err != nil {
+			log.Fatalf(ctx, "failed to clear range data for removed rhs: %v", err)
+		}
+		if err := batch.Commit(true); err != nil {
+			log.Fatalf(ctx, "failed to clear range data for removed rhs: %v", err)
+		}
+		return nil
 	}
-	// Add the range to metrics and maybe gossip on capacity change.
-	s.metrics.ReplicaCount.Inc(1)
-	s.maybeGossipOnCapacityChange(ctx, rangeAddEvent)

 	return nil
 }
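To make the new else-branch of SplitRange above easier to follow (buffer deletions for the stale RHS span in a batch, then apply them in a single commit), here is a toy self-contained sketch. The store and batch types below are invented for illustration and are not the real engine API:

	package main

	import "fmt"

	// A toy stand-in for the engine/batch pattern used above: deletions are
	// buffered in a batch and applied in one commit, mirroring how SplitRange
	// clears the user data the removed RHS would otherwise inherit.
	type store map[string]string

	type batch struct{ deletes []string }

	func (b *batch) clearRange(s store, start, end string) {
		for k := range s {
			if k >= start && k < end {
				b.deletes = append(b.deletes, k)
			}
		}
	}

	func (b *batch) commit(s store) {
		for _, k := range b.deletes {
			delete(s, k)
		}
	}

	func main() {
		s := store{"a1": "lhs", "b1": "rhs", "b2": "rhs"}
		var b batch
		b.clearRange(s, "b", "c") // the RHS span [b, c)
		b.commit(s)
		fmt.Println(len(s)) // 1: only the LHS key remains
	}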
@@ -2580,7 +2614,12 @@
 	// We check both rep.mu.ReplicaID and rep.mu.state.Desc's replica ID because
 	// they can differ in cases when a replica's ID is increased due to an
 	// incoming raft message (see #14231 for background).
+	// TODO(ajwerner): reconsider some of this sanity checking.
 	rep.mu.Lock()
+	if rep.mu.destroyStatus.Removed() {
+		rep.mu.Unlock()
+		return nil
+	}
 	replicaID := rep.mu.replicaID
 	if rep.mu.replicaID >= nextReplicaID {
 		rep.mu.Unlock()
@@ -2600,12 +2639,7 @@
 	}

 	if !rep.IsInitialized() {
-		// The split trigger relies on the fact that it can bypass the tombstone
-		// check for the RHS, but this is only true as long as we never delete
-		// its HardState.
-		//
-		// See the comment in splitPostApply for details.
-		log.Fatalf(ctx, "can not replicaGC uninitialized replicas")
+		log.Fatalf(ctx, "cannot replicaGC uninitialized replicas in this method")
 	}

 	// During merges, the context might have the subsuming range, so we explicitly
@@ -2666,7 +2700,74 @@
 	// TODO(peter): Could release s.mu.Lock() here.
 	s.maybeGossipOnCapacityChange(ctx, rangeRemoveEvent)
 	s.scanner.RemoveReplica(rep)
+	return nil
+}
+
+func (s *Store) removeUninitializedReplicaRaftMuLocked(
+	ctx context.Context, rep *Replica, nextReplicaID roachpb.ReplicaID,
+) error {
+	rep.raftMu.AssertHeld()
+
+	// Sanity check this removal.
+	rep.mu.RLock()
+	ds := rep.mu.destroyStatus
+	isInitialized := rep.isInitializedRLocked()
+	rep.mu.RUnlock()
+	// Somebody already removed this Replica.
+	if ds.Removed() {
+		return nil
+	}
+
+	if !ds.RemovalPending() {
+		log.Fatalf(ctx, "cannot remove uninitialized replica which is not removal pending: %v", ds)
+	}
+
+	// When we're in this state we should have already had our destroy status
+	// set, so it shouldn't have been possible to process any raft messages or
+	// apply any snapshots.
+	if isInitialized {
+		log.Fatalf(ctx, "previously uninitialized replica became initialized before removal")
+	}
+
+	// Proceed with the removal.
+	rep.readOnlyCmdMu.Lock()
+	rep.mu.Lock()
+	rep.cancelPendingCommandsLocked()
+	rep.mu.internalRaftGroup = nil
+	rep.mu.destroyStatus.Set(roachpb.NewRangeNotFoundError(rep.RangeID, rep.store.StoreID()), destroyReasonRemoved)
+	rep.mu.Unlock()
+	rep.readOnlyCmdMu.Unlock()
+
+	if err := rep.destroyUninitializedReplicaRaftMuLocked(ctx, nextReplicaID); err != nil {
+		log.Fatalf(ctx, "failed to remove uninitialized replica %v: %v", rep, err)
+	}
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// Sanity check, could be removed.
+	value, stillExists := s.mu.replicas.Load(int64(rep.RangeID))
+	if !stillExists {
+		log.Fatalf(ctx, "uninitialized replica was removed in the meantime")
+	}
+	existing := (*Replica)(value)
+	if existing == rep {
+		log.Infof(ctx, "removing uninitialized replica %v", rep)
+	} else {
+		log.Fatalf(ctx, "uninitialized replica %v was already removed", rep)
+	}
+
+	s.metrics.ReplicaCount.Dec(1)
+	// Only an uninitialized replica can have a placeholder since, by
+	// definition, an initialized replica will be present in the
+	// replicasByKey map. While the replica will usually consume the
+	// placeholder itself, that isn't guaranteed and so this invocation
+	// here is crucial (i.e. don't remove it).
+	if s.removePlaceholderLocked(ctx, rep.RangeID) {
+		atomic.AddInt32(&s.counts.droppedPlaceholders, 1)
+	}
+	s.unlinkReplicaByRangeIDLocked(rep.RangeID)

 	return nil
 }
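The new removeUninitializedReplicaRaftMuLocked above leans on a small destroy-status state machine (alive, then removal pending, then removed). A simplified model of those transitions, with invented names standing in for the real destroyStatus type, might look like:

	package main

	import (
		"errors"
		"fmt"
	)

	// A simplified model of the transitions the code above relies on. The
	// real destroyStatus in pkg/storage has more to it; this is illustration
	// only.
	type destroyReason int

	const (
		alive destroyReason = iota
		removalPending
		removed
	)

	type destroyStatus struct {
		reason destroyReason
		err    error
	}

	func (d *destroyStatus) Set(err error, r destroyReason) { d.err, d.reason = err, r }
	func (d destroyStatus) RemovalPending() bool            { return d.reason == removalPending }
	func (d destroyStatus) Removed() bool                   { return d.reason == removed }

	func main() {
		var ds destroyStatus
		ds.Set(errors.New("replica too old"), removalPending) // e.g. on a ReplicaTooOldError
		fmt.Println(ds.RemovalPending())                       // true: removal may proceed
		ds.Set(errors.New("range not found"), removed)         // final state, set under raftMu
		fmt.Println(ds.Removed())                              // true: later removals are no-ops
	}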
@@ -3255,6 +3356,10 @@ func (s *Store) HandleSnapshot(
 	})
 }

+// learnerReplicaType exists to avoid allocating when marking a replica as a
+// learner in uncoalesceBeats.
+var learnerReplicaType = roachpb.LEARNER
+
 func (s *Store) uncoalesceBeats(
 	ctx context.Context,
 	beats []RaftHeartbeat,
@@ -3292,8 +3397,8 @@
 			Message: msg,
 			Quiesce: beat.Quiesce,
 		}
-		if log.V(4) {
-			log.Infof(ctx, "uncoalesced beat: %+v", beatReqs[i])
+		if beat.ToIsLearner {
+			beatReqs[i].ToReplica.Type = &learnerReplicaType
 		}

 		if err := s.HandleRaftUncoalescedRequest(ctx, &beatReqs[i], respStream); err != nil {
@@ -3378,6 +3483,7 @@ func (s *Store) withReplicaForRequest(
 		req.RangeID,
 		req.ToReplica.ReplicaID,
 		&req.FromReplica,
+		req.ToReplica.GetType() == roachpb.LEARNER,
 	)
 	if err != nil {
 		return roachpb.NewError(err)
 	}
@@ -3428,7 +3534,11 @@ func (s *Store) processRaftRequestWithReplica(
 		drop := maybeDropMsgApp(ctx, (*replicaMsgAppDropper)(r), &req.Message, req.RangeStartKey)
 		if !drop {
-			if err := r.stepRaftGroup(req); err != nil {
+			isRemoved, err := r.stepRaftGroup(req)
+			if isRemoved {
+				_, err = r.IsDestroyed()
+			}
+			if err != nil {
 				return roachpb.NewError(err)
 			}
 		}
@@ -3501,10 +3611,13 @@ func (s *Store) processRaftSnapshotRequest(
 		}()
 	}

-	if err := r.stepRaftGroup(&snapHeader.RaftMessageRequest); err != nil {
+	isRemoved, err := r.stepRaftGroup(&snapHeader.RaftMessageRequest)
+	if isRemoved {
+		_, err = r.IsDestroyed()
+	}
+	if err != nil {
 		return roachpb.NewError(err)
 	}
-
 	if _, expl, err := r.handleRaftReadyRaftMuLocked(ctx, inSnap); err != nil {
 		fatalOnRaftReadyErr(ctx, expl, err)
 	}
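Both call sites above follow the same shape: step the Raft group, and if stepping reports that the replica was removed, surface the replica's destruction error instead of the sentinel. A compact sketch of that shape, using stand-in names (errRemoved, stepAndTranslate) rather than the real API:

	package main

	import (
		"errors"
		"fmt"
	)

	// errRemoved stands in for apply.ErrRemoved; destroyedErr stands in for
	// the error recorded in the replica's destroyStatus.
	var (
		errRemoved   = errors.New("replica removed")
		destroyedErr = errors.New("r1 was not found on s1")
	)

	// stepAndTranslate mirrors the pattern above: if stepping the group
	// reports removal, report the destruction error rather than the sentinel.
	func stepAndTranslate(step func() (isRemoved bool, err error), isDestroyed func() error) error {
		isRemoved, err := step()
		if isRemoved {
			err = isDestroyed()
		}
		return err
	}

	func main() {
		err := stepAndTranslate(
			func() (bool, error) { return true, errRemoved },
			func() error { return destroyedErr },
		)
		fmt.Println(err) // r1 was not found on s1
	}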
@@ -3543,33 +3656,40 @@ func (s *Store) HandleRaftResponse(ctx context.Context, resp *RaftMessageRespons
 		repl.raftMu.Lock()
 		defer repl.raftMu.Unlock()
 		repl.mu.Lock()
-		defer repl.mu.Unlock()

 		// If the replica ID in the error does not match then we know
 		// that the replica has been removed and re-added quickly. In
 		// that case, we don't want to add it to the replicaGCQueue.
-		if tErr.ReplicaID != repl.mu.replicaID {
-			log.Infof(ctx, "replica too old response with old replica ID: %s", tErr.ReplicaID)
+		// If the replica is not alive then we should also ignore this error.
+		if tErr.ReplicaID != repl.mu.replicaID || !repl.mu.destroyStatus.IsAlive() {
+			repl.mu.Unlock()
 			return nil
 		}

-		// If the replica ID in the error does match, we know the replica
-		// will be removed and we can cancel any pending commands. This is
-		// sometimes necessary to unblock PushTxn operations that are
-		// necessary for the replica GC to succeed.
-		repl.cancelPendingCommandsLocked()
-
 		// The replica will be garbage collected soon (we are sure
 		// since our replicaID is definitely too old), but in the meantime we
 		// already want to bounce all traffic from it. Note that the replica
-		// could be re-added with a higher replicaID, in which this error is
-		// cleared in setReplicaIDRaftMuLockedMuLocked.
-		if repl.mu.destroyStatus.IsAlive() {
-			storeID := repl.store.StoreID()
-			repl.mu.destroyStatus.Set(roachpb.NewRangeNotFoundError(repl.RangeID, storeID), destroyReasonRemovalPending)
+		// could be re-added with a higher replicaID, but we want to clear the
+		// replica's data before that happens.
+		if log.V(1) {
+			log.Infof(ctx, "setting local replica to destroyed due to ReplicaTooOld error: %v", repl)
 		}
-		s.replicaGCQueue.AddAsync(ctx, repl, replicaGCPriorityRemoved)
+		storeID := repl.store.StoreID()
+		// NB: We know that there's a later copy of this range on this store, but
+		// introducing another, more specific error for this case is overkill, so
+		// we return RangeNotFoundError, which the recipient will properly
+		// ignore.
+		repl.mu.destroyStatus.Set(roachpb.NewRangeNotFoundError(repl.RangeID, storeID),
+			destroyReasonRemovalPending)
+		repl.mu.Unlock()
+		nextReplicaID := tErr.ReplicaID + 1
+		if !repl.IsInitialized() {
+			return s.removeUninitializedReplicaRaftMuLocked(ctx, repl, nextReplicaID)
+		}
+		return s.removeReplicaImpl(ctx, repl, nextReplicaID, RemoveOptions{
+			DestroyData: true,
+		})
 	case *roachpb.RaftGroupDeletedError:
 		if replErr != nil {
 			// RangeNotFoundErrors are expected here; nothing else is.
@@ -3632,7 +3752,11 @@ func (s *Store) processRequestQueue(ctx context.Context, rangeID roachpb.RangeID
 			// giving up the lock. Set lastRepl to nil, so we don't handle it
 			// down below as well.
 			lastRepl = nil
-			if _, expl, err := r.handleRaftReadyRaftMuLocked(ctx, noSnap); err != nil {
+			switch _, expl, err := r.handleRaftReadyRaftMuLocked(ctx, noSnap); err {
+			case nil:
+			case apply.ErrRemoved:
+				// Return; hopefully never to be heard from again.
+			default:
 				fatalOnRaftReadyErr(ctx, expl, err)
 			}
 		}
@@ -3677,7 +3801,11 @@ func (s *Store) processRequestQueue(ctx context.Context, rangeID roachpb.RangeID
 		// handleRaftReadyRaftMuLocked) since racing to handle Raft Ready won't
 		// have any undesirable results.
 		ctx = lastRepl.AnnotateCtx(ctx)
-		if _, expl, err := lastRepl.handleRaftReady(ctx, noSnap); err != nil {
+		switch _, expl, err := lastRepl.handleRaftReady(ctx, noSnap); err {
+		case nil:
+		case apply.ErrRemoved:
+			// Return; hopefully never to be heard from again.
+		default:
 			fatalOnRaftReadyErr(ctx, expl, err)
 		}
 	}
@@ -3693,8 +3821,12 @@ func (s *Store) processReady(ctx context.Context, rangeID roachpb.RangeID) {
 	ctx = r.AnnotateCtx(ctx)
 	start := timeutil.Now()
 	stats, expl, err := r.handleRaftReady(ctx, noSnap)
-	if err != nil {
-		log.Fatalf(ctx, "%s: %+v", log.Safe(expl), err) // TODO(bdarnell)
+	switch err {
+	case nil:
+	case apply.ErrRemoved:
+		return
+	default:
+		fatalOnRaftReadyErr(ctx, expl, err)
 	}
 	elapsed := timeutil.Since(start)
 	s.metrics.RaftWorkingDurationNanos.Inc(elapsed.Nanoseconds())
@@ -3936,13 +4068,20 @@ func (s *Store) getOrCreateReplica(
 	rangeID roachpb.RangeID,
 	replicaID roachpb.ReplicaID,
 	creatingReplica *roachpb.ReplicaDescriptor,
+	isLearner bool,
 ) (_ *Replica, created bool, _ error) {
+	r := retry.Start(retry.Options{
+		InitialBackoff: time.Microsecond,
+		MaxBackoff:     10 * time.Millisecond,
+	})
 	for {
+		r.Next()
 		r, created, err := s.tryGetOrCreateReplica(
 			ctx,
 			rangeID,
 			replicaID,
 			creatingReplica,
+			isLearner,
 		)
 		if err == errRetry {
 			continue
 		}
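The retry added to getOrCreateReplica above hinges on a sentinel error plus a capped backoff: a lookup that finds a replica in the middle of being destroyed reports errRetry and the loop tries again. A self-contained sketch of that pattern (the backoff numbers and helper names here are illustrative, not taken from the patch):

	package main

	import (
		"errors"
		"fmt"
		"time"
	)

	// errRetry plays the role of the sentinel used above: the lookup found a
	// replica whose removal is pending, so try again shortly.
	var errRetry = errors.New("retry: orphaned replica")

	func getOrCreate(try func() (string, error)) (string, error) {
		backoff := time.Microsecond
		for {
			v, err := try()
			if errors.Is(err, errRetry) {
				time.Sleep(backoff)
				if backoff *= 2; backoff > 10*time.Millisecond {
					backoff = 10 * time.Millisecond // cap the backoff
				}
				continue
			}
			return v, err
		}
	}

	func main() {
		calls := 0
		v, err := getOrCreate(func() (string, error) {
			if calls++; calls < 3 {
				return "", errRetry // e.g. the existing replica has a pending removal
			}
			return "replica r42", nil
		})
		fmt.Println(v, err, calls) // replica r42 <nil> 3
	}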
@@ -3965,33 +4104,99 @@ func (s *Store) tryGetOrCreateReplica(
 	rangeID roachpb.RangeID,
 	replicaID roachpb.ReplicaID,
 	creatingReplica *roachpb.ReplicaDescriptor,
-) (_ *Replica, created bool, _ error) {
-	// The common case: look up an existing (initialized) replica.
-	if value, ok := s.mu.replicas.Load(int64(rangeID)); ok {
-		repl := (*Replica)(value)
-		repl.raftMu.Lock() // not unlocked
-		repl.mu.Lock()
-		defer repl.mu.Unlock()
-
-		var replTooOldErr error
-		if creatingReplica != nil {
+	isLearner bool,
+) (_ *Replica, created bool, err error) {
+	var (
+		removeReplica = func(repl *Replica) error {
+			repl.mu.destroyStatus.Set(err, destroyReasonRemovalPending)
+			isInitialized := repl.isInitializedRLocked()
+			repl.mu.Unlock()
+			defer repl.mu.Lock()
+			if !isInitialized {
+				if err := s.removeUninitializedReplicaRaftMuLocked(ctx, repl, replicaID); err != nil {
+					log.Fatalf(ctx, "failed to remove uninitialized replica: %v", err)
+				}
+			} else {
+				if err := s.removeReplicaImpl(ctx, repl, replicaID, RemoveOptions{
+					DestroyData: true,
+				}); err != nil {
+					log.Fatal(ctx, err)
+				}
+			}
+			return errRetry
+		}
+		handleFromReplicaTooOld = func(repl *Replica) error {
+			if creatingReplica == nil {
+				return nil
+			}
 			// Drop messages that come from a node that we believe was once a member of
 			// the group but has been removed.
 			desc := repl.mu.state.Desc
 			_, found := desc.GetReplicaDescriptorByID(creatingReplica.ReplicaID)
 			// It's not a current member of the group. Is it from the past?
 			if !found && creatingReplica.ReplicaID < desc.NextReplicaID {
-				replTooOldErr = roachpb.NewReplicaTooOldError(creatingReplica.ReplicaID)
+				return roachpb.NewReplicaTooOldError(creatingReplica.ReplicaID)
+			}
+			return nil
+		}
+		handleToReplicaTooOld = func(repl *Replica) error {
+			if replicaID == 0 || repl.mu.replicaID == 0 || repl.mu.replicaID >= replicaID {
+				return nil
+			}
+			if log.V(1) {
+				log.Infof(ctx, "found message for newer replica ID: %v %v %v %v %v", repl.mu.replicaID, replicaID, repl.mu.minReplicaID, repl.mu.state.Desc, &repl.mu.destroyStatus)
 			}
+			return removeReplica(repl)
+		}
+		handleNonVoterWithPreemptiveSnapshot = func(repl *Replica) error {
+			if replicaID == 0 && isLearner {
+				log.Fatalf(ctx, "cannot have zero replicaID and be a learner")
+			}
+			// replicaID must be non-zero if isLearner.
+			if repl.mu.replicaID != 0 || !repl.isInitializedRLocked() || !isLearner {
+				return nil
+			}
+			if log.V(1) {
+				log.Infof(ctx, "found message for replica ID %v as non-voter but currently not part of the range, destroying preemptive snapshot", replicaID)
+			}
+			return removeReplica(repl)
+		}
+	)
+	// The common case: look up an existing (initialized) replica.
+	if value, ok := s.mu.replicas.Load(int64(rangeID)); ok {
+		repl := (*Replica)(value)
+		repl.raftMu.Lock() // not unlocked on success
+		repl.mu.Lock()
+		defer repl.mu.Unlock()
+		if err := handleFromReplicaTooOld(repl); err != nil {
+			repl.raftMu.Unlock()
+			return nil, false, err
+		}
+		if repl.mu.destroyStatus.RemovalPending() {
+			repl.raftMu.Unlock()
+			return nil, false, errRetry
+		}
+		if err := handleToReplicaTooOld(repl); err != nil {
+			repl.raftMu.Unlock()
+			return nil, false, err
+		}
+		if err := handleNonVoterWithPreemptiveSnapshot(repl); err != nil {
+			repl.raftMu.Unlock()
+			return nil, false, err
 		}

 		var err error
-		if replTooOldErr != nil {
-			err = replTooOldErr
-		} else if ds := repl.mu.destroyStatus; ds.reason == destroyReasonRemoved {
-			err = errRetry
-		} else {
-			err = repl.setReplicaIDRaftMuLockedMuLocked(replicaID)
+		if repl.mu.replicaID == 0 && replicaID != 0 {
+			// This message is telling us about our replica ID.
+			// This is a common case when dealing with preemptive snapshots if we're
+			// initialized.
+			err = repl.setReplicaIDRaftMuLockedMuLocked(repl.AnnotateCtx(ctx), replicaID)
+		} else if replicaID != 0 && repl.mu.replicaID > replicaID {
+			// TODO(ajwerner): probably just silently drop this message.
+			err = roachpb.NewRangeNotFoundError(rangeID, s.StoreID())
+		} else if replicaID != 0 && repl.mu.replicaID != replicaID {
+			log.Fatalf(ctx, "we should never update the replica id based on a message %v %v",
+				repl.mu.replicaID, replicaID)
 		}
 		if err != nil {
 			repl.raftMu.Unlock()
@@ -4050,7 +4255,7 @@ func (s *Store) tryGetOrCreateReplica(
 		repl.raftMu.Unlock()
 		return nil, false, err
 	} else if hs.Commit != 0 {
-		log.Fatalf(ctx, "found non-zero HardState.Commit on uninitialized replica %s. HS=%+v", repl, hs)
+		log.Fatalf(ctx, "found non-zero HardState.Commit on uninitialized replica %s. HS=%+v %v", repl, hs, repl.mu.stateLoader.RaftHardStateKey())
 	}

 	desc := &roachpb.RangeDescriptor{
diff --git a/pkg/storage/store_snapshot.go b/pkg/storage/store_snapshot.go
index 3fb7ff8b74d5..9c138f8846ec 100644
--- a/pkg/storage/store_snapshot.go
+++ b/pkg/storage/store_snapshot.go
@@ -628,7 +628,9 @@ func (s *Store) canApplySnapshotLocked(
 	existingRepl.raftMu.AssertHeld()

 	existingRepl.mu.RLock()
-	existingIsInitialized := existingRepl.isInitializedRLocked()
+	existingDesc := existingRepl.mu.state.Desc
+	existingIsInitialized := existingDesc.IsInitialized()
+	existingDestroyStatus := existingRepl.mu.destroyStatus
 	existingRepl.mu.RUnlock()

 	if existingIsInitialized {
@@ -637,15 +639,19 @@ func (s *Store) canApplySnapshotLocked(
 		// in Replica.maybeAcquireSnapshotMergeLock for how this is
 		// made safe.
 		//
-		// NB: we expect the replica to know its replicaID at this point
-		// (i.e. !existingIsPreemptive), though perhaps it's possible
-		// that this isn't true if the leader initiates a Raft snapshot
-		// (that would provide a range descriptor with this replica in
-		// it) but this node reboots (temporarily forgetting its
-		// replicaID) before the snapshot arrives.
+		// NB: The snapshot must be intended for this replica as
+		// withReplicaForRequest ensures that requests with a non-zero replica
+		// id are passed to a replica with a matching id. Given this is not a
+		// preemptive snapshot we know that its id must be non-zero.
 		return nil, nil
 	}

+	// If we are not alive then we should not apply a snapshot as our removal
+	// is imminent.
+	if existingDestroyStatus.RemovalPending() {
+		return nil, existingDestroyStatus.err
+	}
+
 	// We have a key range [desc.StartKey,desc.EndKey) which we want to apply a
 	// snapshot for. Is there a conflicting existing placeholder or an
 	// overlapping range?
@@ -670,7 +676,7 @@ func (s *Store) checkSnapshotOverlapLocked(
 	// NB: this check seems redundant since placeholders are also represented in
 	// replicasByKey (and thus returned in getOverlappingKeyRangeLocked).
 	if exRng, ok := s.mu.replicaPlaceholders[desc.RangeID]; ok {
-		return errors.Errorf("%s: canApplySnapshotLocked: cannot add placeholder, have an existing placeholder %s", s, exRng)
+		return errors.Errorf("%s: canApplySnapshotLocked: cannot add placeholder, have an existing placeholder %s %v", s, exRng, snapHeader.RaftMessageRequest.FromReplica)
 	}

 	// TODO(benesch): consider discovering and GC'ing *all* overlapping ranges,
@@ -732,47 +738,28 @@
 // snapshot.
 func (s *Store) shouldAcceptSnapshotData(
 	ctx context.Context, snapHeader *SnapshotRequest_Header,
-) error {
+) (err error) {
 	if snapHeader.IsPreemptive() {
 		return crdberrors.AssertionFailedf(`expected a raft or learner snapshot`)
 	}
-
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	// TODO(tbg): see the comment on desc.Generation for what seems to be a much
-	// saner way to handle overlap via generational semantics.
-	desc := *snapHeader.State.Desc
-
-	// First, check for an existing Replica.
-	if v, ok := s.mu.replicas.Load(
-		int64(desc.RangeID),
-	); ok {
-		existingRepl := (*Replica)(v)
-		existingRepl.mu.RLock()
-		existingIsInitialized := existingRepl.isInitializedRLocked()
-		existingRepl.mu.RUnlock()
-
-		if existingIsInitialized {
-			// Regular Raft snapshots can't be refused at this point,
-			// even if they widen the existing replica. See the comments
-			// in Replica.maybeAcquireSnapshotMergeLock for how this is
-			// made safe.
-			//
-			// NB: we expect the replica to know its replicaID at this point
-			// (i.e. !existingIsPreemptive), though perhaps it's possible
-			// that this isn't true if the leader initiates a Raft snapshot
-			// (that would provide a range descriptor with this replica in
-			// it) but this node reboots (temporarily forgetting its
-			// replicaID) before the snapshot arrives.
+	pErr := s.withReplicaForRequest(ctx, &snapHeader.RaftMessageRequest,
+		func(ctx context.Context, r *Replica) *roachpb.Error {
+			// If the current replica is not initialized then we should accept this
+			// snapshot if it doesn't overlap existing ranges.
+			if !r.IsInitialized() {
+				s.mu.Lock()
+				defer s.mu.Unlock()
+				return roachpb.NewError(s.checkSnapshotOverlapLocked(ctx, snapHeader))
+			}
+			// If the current range is initialized then we need to accept this
+			// snapshot. There's a hidden nasty case here during 19.2 where
+			// our currently initialized range is due to a preemptive snapshot and
+			// we've now assigned that range a replica ID. Fundamentally at this
+			// point we want to clear out the preemptive snapshot because applying
+			// a learner snapshot on top of it is likely going to be problematic.
 			return nil
-		}
-	}
-
-	// We have a key range [desc.StartKey,desc.EndKey) which we want to apply a
-	// snapshot for. Is there a conflicting existing placeholder or an
-	// overlapping range?
-	return s.checkSnapshotOverlapLocked(ctx, snapHeader)
+		})
+	return pErr.GoError()
 }

 // receiveSnapshot receives an incoming snapshot via a pre-opened GRPC stream.
diff --git a/pkg/storage/store_snapshot_test.go b/pkg/storage/store_snapshot_test.go
index 12c10f2e7316..adec003cf74f 100644
--- a/pkg/storage/store_snapshot_test.go
+++ b/pkg/storage/store_snapshot_test.go
@@ -114,7 +114,7 @@ func TestSnapshotPreemptiveOnUninitializedReplica(t *testing.T) {
 	store, _ := createTestStore(t, testStoreOpts{}, stopper)

 	// Create an uninitialized replica.
-	repl, created, err := store.getOrCreateReplica(ctx, 77, 1, nil)
+	repl, created, err := store.getOrCreateReplica(ctx, 77, 1, nil, true)
 	if err != nil {
 		t.Fatal(err)
 	}
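The rewritten shouldAcceptSnapshotData above reduces to two branches: an uninitialized replica only needs an overlap check, while an initialized one must accept the snapshot. A stand-alone sketch of that decision, with checkOverlap standing in for checkSnapshotOverlapLocked (not the real API):

	package main

	import (
		"errors"
		"fmt"
	)

	// shouldAccept mirrors the two branches above. checkOverlap is an
	// illustrative stand-in for the store's overlap check.
	func shouldAccept(initialized bool, checkOverlap func() error) error {
		if !initialized {
			// An uninitialized replica owns no keyspace yet, so the only
			// question is whether the snapshot would collide with another range.
			return checkOverlap()
		}
		// An initialized replica must accept the snapshot; see the comment in
		// the hunk above about the preemptive-snapshot case in 19.2.
		return nil
	}

	func main() {
		overlap := errors.New("intersects existing range")
		fmt.Println(shouldAccept(false, func() error { return overlap })) // intersects existing range
		fmt.Println(shouldAccept(true, func() error { return overlap }))  // <nil>
	}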
diff --git a/pkg/storage/store_test.go b/pkg/storage/store_test.go
index 8829fd7a15cb..dc57a80cba01 100644
--- a/pkg/storage/store_test.go
+++ b/pkg/storage/store_test.go
@@ -569,8 +569,8 @@ func TestStoreAddRemoveRanges(t *testing.T) {
 	// Try to remove range 1 again.
 	if err := store.RemoveReplica(context.Background(), repl1, repl1.Desc().NextReplicaID, RemoveOptions{
 		DestroyData: true,
-	}); err == nil {
-		t.Fatal("expected error re-removing same range")
+	}); err != nil {
+		t.Fatalf("didn't expect error re-removing same range: %v", err)
 	}
 	// Try to add a range with previously-used (but now removed) ID.
 	repl2Dup := createReplica(store, 1, roachpb.RKey("a"), roachpb.RKey("b"))
@@ -712,11 +712,11 @@ func TestStoreRemoveReplicaDestroy(t *testing.T) {

 	// Verify that removal of a replica marks it as destroyed so that future raft
 	// commands on the Replica will silently be dropped.
-	if err := repl1.withRaftGroup(true, func(r *raft.RawNode) (bool, error) {
+	isRemoved, err := repl1.withRaftGroup(true, func(r *raft.RawNode) (bool, error) {
 		return true, errors.Errorf("unexpectedly created a raft group")
-	}); err != nil {
-		t.Fatal(err)
-	}
+	})
+	require.True(t, isRemoved)
+	require.NoError(t, err)

 	repl1.mu.Lock()
 	expErr := roachpb.NewError(repl1.mu.destroyStatus.err)
@@ -1354,7 +1354,7 @@ func splitTestRange(store *Store, key, splitKey roachpb.RKey, t *testing.T) *Rep
 	require.NoError(t, err)
 	newLeftDesc := *repl.Desc()
 	newLeftDesc.EndKey = splitKey
-	err = store.SplitRange(repl.AnnotateCtx(context.TODO()), repl, newRng, newLeftDesc)
+	err = store.SplitRange(repl.AnnotateCtx(context.TODO()), repl, newRng, newLeftDesc, nil)
 	require.NoError(t, err)
 	return newRng
 }
@@ -2953,104 +2953,6 @@ func TestStoreRemovePlaceholderOnRaftIgnored(t *testing.T) {
 	})
 }

-// Test that we set proper tombstones for removed replicas and use the
-// tombstone to reject attempts to create a replica with a lesser ID.
-func TestRemovedReplicaTombstone(t *testing.T) {
-	defer leaktest.AfterTest(t)()
-
-	const rangeID = 1
-	creatingReplica := roachpb.ReplicaDescriptor{
-		NodeID:    2,
-		StoreID:   2,
-		ReplicaID: 2,
-	}
-
-	// All test cases assume that the starting replica ID is 1. This assumption
-	// is enforced by a check within the test logic.
-	testCases := []struct {
-		setReplicaID      roachpb.ReplicaID // set the existing replica to this before removing it
-		descNextReplicaID roachpb.ReplicaID // the descriptor's NextReplicaID during replica removal
-		createReplicaID   roachpb.ReplicaID // try creating a replica at this ID
-		expectCreated     bool
-	}{
-		{1, 2, 2, true},
-		{1, 2, 1, false},
-		{1, 2, 1, false},
-		{1, 3, 1, false},
-		{1, 3, 2, false},
-		{1, 3, 3, true},
-		{1, 99, 98, false},
-		{1, 99, 99, true},
-		{2, 2, 2, false},
-		{2, 2, 3, true},
-		{2, 2, 99, true},
-		{98, 2, 98, false},
-		{98, 2, 99, true},
-	}
-	for _, c := range testCases {
-		t.Run("", func(t *testing.T) {
-			tc := testContext{}
-			stopper := stop.NewStopper()
-			ctx := context.TODO()
-			defer stopper.Stop(ctx)
-			tc.Start(t, stopper)
-			s := tc.store
-
-			repl1, err := s.GetReplica(rangeID)
-			if err != nil {
-				t.Fatal(err)
-			}
-			repl1.mu.Lock()
-			if repl1.mu.replicaID != 1 {
-				repl1.mu.Unlock()
-				t.Fatalf("test precondition not met; expected ReplicaID=1, got %d", repl1.mu.replicaID)
-			}
-			repl1.mu.Unlock()
-
-			// Try to trigger a race where the replica ID gets increased during the GC
-			// process by taking the store lock and inserting a short sleep to cause
-			// the goroutine to start running the setReplicaID call.
-			errChan := make(chan error)
-
-			func() {
-				repl1.raftMu.Lock()
-				defer repl1.raftMu.Unlock()
-				s.mu.Lock()
-				defer s.mu.Unlock()
-				repl1.mu.Lock()
-				defer repl1.mu.Unlock()
-
-				go func() {
-					errChan <- s.RemoveReplica(ctx, repl1, c.descNextReplicaID, RemoveOptions{DestroyData: true})
-				}()
-
-				time.Sleep(1 * time.Millisecond)
-
-				if err := repl1.setReplicaIDRaftMuLockedMuLocked(c.setReplicaID); err != nil {
-					t.Fatal(err)
-				}
-			}()
-
-			if err := <-errChan; testutils.IsError(err, "replica ID has changed") {
-				// We didn't trigger the race, so just return success.
-				return
-			} else if err != nil {
-				t.Fatal(err)
-			}
-
-			_, created, err := s.getOrCreateReplica(ctx, rangeID, c.createReplicaID, &creatingReplica)
-			if created != c.expectCreated {
-				t.Errorf("expected s.getOrCreateReplica(%d, %d, %v).created=%v, got %v",
-					rangeID, c.createReplicaID, creatingReplica, c.expectCreated, created)
-			}
-			if !c.expectCreated && !testutils.IsError(err, "raft group deleted") {
-				t.Errorf("expected s.getOrCreateReplica(%d, %d, %v).err='raft group deleted', got %v",
-					rangeID, c.createReplicaID, creatingReplica, err)
-			}
-		})
-	}
-}
-
 type fakeSnapshotStream struct {
 	nextResp *SnapshotResponse
 	nextErr  error
diff --git a/pkg/storage/testing_knobs.go b/pkg/storage/testing_knobs.go
index 7ee8dccf6953..91d5b48e8f9f 100644
--- a/pkg/storage/testing_knobs.go
+++ b/pkg/storage/testing_knobs.go
@@ -139,6 +139,12 @@ type StoreTestingKnobs struct {
 	// DisableRefreshReasonTicks disables refreshing pending commands
 	// periodically.
 	DisableRefreshReasonTicks bool
+	// DisableEagerReplicaRemoval prevents the Replica from destroying itself
+	// when it encounters a ChangeReplicasTrigger which would remove it.
+	// This option can lead to nasty cases during shutdown where a replica will
+	// spin attempting to acquire a split or merge lock on a RHS which will
+	// always fail. It is generally not safe, but is useful for testing.
+	DisableEagerReplicaRemoval bool
 	// RefreshReasonTicksPeriod overrides the default period over which
 	// pending commands are refreshed. The period is specified as a multiple
 	// of Raft group ticks.