From 76483d9ac7566414f0898af534e79f38241c4926 Mon Sep 17 00:00:00 2001 From: Andrew Werner Date: Wed, 4 Sep 2019 11:25:27 -0400 Subject: [PATCH] storage: ensure Replica objects never change replicaID We've seen instability recently due to invariants being violated as replicas catch up across periods of being removed and re-added to a range. Due to learner replicas and their rollback behavior this is now a relatively common case. Rather than handle all of these various scenarios, this PR prevents them from occurring by actively removing replicas when we determine that they must have been removed. Here's a high-level overview of the change: * Once a Replica object has a non-zero Replica.mu.replicaID it will not change. * In this commit however, if a node crashes it may forget that it learned about a replica ID. * If a raft message or snapshot addressed to a higher replica ID is received, the current replica will be removed completely. * If a replica sees a ChangeReplicasTrigger which removes it then it completely removes itself while applying that command. * Replica.mu.destroyStatus is used to meaningfully signify the removal state of a Replica. Replicas about to be synchronously removed are in destroyReasonRemovalPending. This hopefully gives us some new invariants: * There is only ever at most 1 Replica which IsAlive() for a range on a Store at a time. * Once a Replica has a non-zero ReplicaID it never changes. * This applies only to the in-memory object, not the store itself. * Once a Replica applies a command as a part of the range descriptor it will never apply another command as a different Replica ID or outside of the Range. * Corollary: a Replica created as a learner will only ever apply commands while that replica is in the range. The change also introduces some new complexity. Namely, we now allow removal of uninitialized replicas, including their hard state. This allows us to catch up across a split even when we know the RHS must have been removed. Fixes #40367. Issue https://github.com/cockroachdb/cockroach/issues/38772#issuecomment-527721688 manifests itself as the RHS not being found for a merge. This happens because the Replica is processing commands to catch itself up while it is not in the range. This is no longer possible. Fixes #40257. Issue #40257 is another case of a replica processing commands while it is not in the range. Fixes #40470. Issue #40470 is caused by an RHS learning about its existence and removal prior to the LHS processing a split. This case is now handled properly and is tested. Release justification: This commit is safe for 19.2 because it fixes release blockers. Release note (bug fix): Avoid internal re-use of Replica objects to fix the following crashes: #38772 "found rXXX:{-} [, next=0, gen=0?]
in place of the RHS" #39796 "replica descriptor of local store not found in right hand side of split" #40470 "split trigger found right-hand side with tombstone" #40257 "snapshot widens existing replica, but no replica exists for subsumed key" --- pkg/storage/apply/task.go | 5 + pkg/storage/client_merge_test.go | 122 ++-- pkg/storage/client_metrics_test.go | 6 +- pkg/storage/client_raft_helpers_test.go | 291 +++++++++ pkg/storage/client_raft_test.go | 533 +++++++++++++++- pkg/storage/client_replica_gc_test.go | 5 +- pkg/storage/client_replica_test.go | 220 +++++++ pkg/storage/client_split_test.go | 11 +- pkg/storage/client_test.go | 40 +- pkg/storage/replica.go | 75 ++- pkg/storage/replica_application_result.go | 49 +- .../replica_application_state_machine.go | 130 +++- pkg/storage/replica_destroy.go | 86 ++- pkg/storage/replica_gc_queue.go | 37 +- pkg/storage/replica_init.go | 72 +-- pkg/storage/replica_learner_test.go | 12 +- pkg/storage/replica_raft.go | 66 +- pkg/storage/replica_raftstorage.go | 10 +- pkg/storage/replica_rangefeed_test.go | 20 +- pkg/storage/replica_stats.go | 16 + pkg/storage/replica_test.go | 117 +--- pkg/storage/store.go | 572 +++++++++++------- pkg/storage/store_snapshot.go | 67 +- pkg/storage/store_snapshot_preemptive.go | 4 +- pkg/storage/store_test.go | 120 +--- pkg/storage/testing_knobs.go | 7 + 26 files changed, 1926 insertions(+), 767 deletions(-) create mode 100644 pkg/storage/client_raft_helpers_test.go diff --git a/pkg/storage/apply/task.go b/pkg/storage/apply/task.go index 8bfa62acd0a1..26e9ef9511dd 100644 --- a/pkg/storage/apply/task.go +++ b/pkg/storage/apply/task.go @@ -12,6 +12,7 @@ package apply import ( "context" + "errors" "go.etcd.io/etcd/raft/raftpb" ) @@ -54,6 +55,10 @@ type StateMachine interface { ApplySideEffects(CheckedCommand) (AppliedCommand, error) } +// ErrRemoved can be returned from ApplySideEffects which will stop the +// task from processing more commands and return immediately. +var ErrRemoved = errors.New("replica removed") + // Batch accumulates a series of updates from Commands and performs them // all at once to its StateMachine when applied. Groups of Commands will be // staged in the Batch such that one or more trivial Commands are staged or diff --git a/pkg/storage/client_merge_test.go b/pkg/storage/client_merge_test.go index 9917ada54c26..ae0068469f2b 100644 --- a/pkg/storage/client_merge_test.go +++ b/pkg/storage/client_merge_test.go @@ -57,7 +57,6 @@ import ( "github.com/pkg/errors" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "go.etcd.io/etcd/raft" "go.etcd.io/etcd/raft/raftpb" ) @@ -1641,6 +1640,7 @@ func TestStoreReplicaGCAfterMerge(t *testing.T) { storeCfg.TestingKnobs.DisableReplicateQueue = true storeCfg.TestingKnobs.DisableReplicaGCQueue = true storeCfg.TestingKnobs.DisableMergeQueue = true + storeCfg.TestingKnobs.DisableEagerReplicaRemoval = true mtc := &multiTestContext{storeConfig: &storeCfg} mtc.Start(t, 2) defer mtc.Stop() @@ -2043,8 +2043,20 @@ func TestStoreRangeMergeSlowAbandonedFollower(t *testing.T) { lhsRepl2.RaftUnlock() // Ensure that the unblocked merge eventually applies and subsumes the RHS. + // In general this will happen due to receiving a ReplicaTooOldError but + // it may require the replica GC queue. In rare cases the LHS will never + // hear about the merge and may need to be GC'd on its own. testutils.SucceedsSoon(t, func() error { - if _, err := store2.GetReplica(rhsDesc.RangeID); err == nil { + // Make the the LHS gets destroyed. 
+ if lhsRepl, err := store2.GetReplica(lhsDesc.RangeID); err == nil { + if err := store2.ManualReplicaGC(lhsRepl); err != nil { + t.Fatal(err) + } + } + if rhsRepl, err := store2.GetReplica(rhsDesc.RangeID); err == nil { + if err := store2.ManualReplicaGC(rhsRepl); err != nil { + t.Fatal(err) + } return errors.New("rhs not yet destroyed") } return nil @@ -2060,6 +2072,7 @@ func TestStoreRangeMergeAbandonedFollowers(t *testing.T) { storeCfg.TestingKnobs.DisableReplicaGCQueue = true storeCfg.TestingKnobs.DisableSplitQueue = true storeCfg.TestingKnobs.DisableMergeQueue = true + storeCfg.TestingKnobs.DisableEagerReplicaRemoval = true mtc := &multiTestContext{storeConfig: &storeCfg} mtc.Start(t, 3) defer mtc.Stop() @@ -2827,74 +2840,6 @@ func TestStoreRangeMergeSlowWatcher(t *testing.T) { } } -// unreliableRaftHandler drops all Raft messages that are addressed to the -// specified rangeID, but lets all other messages through. -type unreliableRaftHandler struct { - rangeID roachpb.RangeID - storage.RaftMessageHandler - // If non-nil, can return false to avoid dropping a msg to rangeID - dropReq func(*storage.RaftMessageRequest) bool - dropHB func(*storage.RaftHeartbeat) bool - dropResp func(*storage.RaftMessageResponse) bool -} - -func (h *unreliableRaftHandler) HandleRaftRequest( - ctx context.Context, - req *storage.RaftMessageRequest, - respStream storage.RaftMessageResponseStream, -) *roachpb.Error { - if len(req.Heartbeats)+len(req.HeartbeatResps) > 0 { - reqCpy := *req - req = &reqCpy - req.Heartbeats = h.filterHeartbeats(req.Heartbeats) - req.HeartbeatResps = h.filterHeartbeats(req.HeartbeatResps) - if len(req.Heartbeats)+len(req.HeartbeatResps) == 0 { - // Entirely filtered. - return nil - } - } else if req.RangeID == h.rangeID { - if h.dropReq == nil || h.dropReq(req) { - log.Infof( - ctx, - "dropping Raft message %s", - raft.DescribeMessage(req.Message, func([]byte) string { - return "" - }), - ) - - return nil - } - } - return h.RaftMessageHandler.HandleRaftRequest(ctx, req, respStream) -} - -func (h *unreliableRaftHandler) filterHeartbeats( - hbs []storage.RaftHeartbeat, -) []storage.RaftHeartbeat { - if len(hbs) == 0 { - return hbs - } - var cpy []storage.RaftHeartbeat - for i := range hbs { - hb := &hbs[i] - if hb.RangeID != h.rangeID || (h.dropHB != nil && !h.dropHB(hb)) { - cpy = append(cpy, *hb) - } - } - return cpy -} - -func (h *unreliableRaftHandler) HandleRaftResponse( - ctx context.Context, resp *storage.RaftMessageResponse, -) error { - if resp.RangeID == h.rangeID { - if h.dropResp == nil || h.dropResp(resp) { - return nil - } - } - return h.RaftMessageHandler.HandleRaftResponse(ctx, resp) -} - func TestStoreRangeMergeRaftSnapshot(t *testing.T) { defer leaktest.AfterTest(t)() @@ -3082,20 +3027,22 @@ func TestStoreRangeMergeRaftSnapshot(t *testing.T) { mtc.transport.Listen(store2.Ident.StoreID, &unreliableRaftHandler{ rangeID: aRepl0.RangeID, RaftMessageHandler: store2, - dropReq: func(req *storage.RaftMessageRequest) bool { - // Make sure that even going forward no MsgApp for what we just - // truncated can make it through. The Raft transport is asynchronous - // so this is necessary to make the test pass reliably - otherwise - // the follower on store2 may catch up without needing a snapshot, - // tripping up the test. - // - // NB: the Index on the message is the log index that _precedes_ any of the - // entries in the MsgApp, so filter where msg.Index < index, not <= index. 
- return req.Message.Type == raftpb.MsgApp && req.Message.Index < index + dropFuncs: dropFuncs{ + dropReq: func(req *storage.RaftMessageRequest) bool { + // Make sure that even going forward no MsgApp for what we just + // truncated can make it through. The Raft transport is asynchronous + // so this is necessary to make the test pass reliably - otherwise + // the follower on store2 may catch up without needing a snapshot, + // tripping up the test. + // + // NB: the Index on the message is the log index that _precedes_ any of the + // entries in the MsgApp, so filter where msg.Index < index, not <= index. + return req.Message.Type == raftpb.MsgApp && req.Message.Index < index + }, + // Don't drop heartbeats or responses. + dropHB: func(*storage.RaftHeartbeat) bool { return false }, + dropResp: func(*storage.RaftMessageResponse) bool { return false }, }, - // Don't drop heartbeats or responses. - dropHB: func(*storage.RaftHeartbeat) bool { return false }, - dropResp: func(*storage.RaftMessageResponse) bool { return false }, }) // Wait for all replicas to catch up to the same point. Because we truncated @@ -3372,9 +3319,12 @@ func TestMergeQueue(t *testing.T) { t.Run("non-collocated", func(t *testing.T) { reset(t) verifyUnmerged(t) - mtc.replicateRange(rhs().RangeID, 1) - mtc.transferLease(ctx, rhs().RangeID, 0, 1) - mtc.unreplicateRange(rhs().RangeID, 0) + rhsRangeID := rhs().RangeID + mtc.replicateRange(rhsRangeID, 1) + mtc.transferLease(ctx, rhsRangeID, 0, 1) + mtc.unreplicateRange(rhsRangeID, 0) + require.NoError(t, mtc.waitForUnreplicated(rhsRangeID, 0)) + clearRange(t, lhsStartKey, rhsEndKey) store.MustForceMergeScanAndProcess() verifyMerged(t) diff --git a/pkg/storage/client_metrics_test.go b/pkg/storage/client_metrics_test.go index 4fd43fcbcc21..c91d386c9e81 100644 --- a/pkg/storage/client_metrics_test.go +++ b/pkg/storage/client_metrics_test.go @@ -26,6 +26,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/metric" "github.com/pkg/errors" + "github.com/stretchr/testify/require" ) func checkGauge(t *testing.T, id string, g *metric.Gauge, e int64) { @@ -313,8 +314,9 @@ func TestStoreMetrics(t *testing.T) { return mtc.unreplicateRangeNonFatal(replica.RangeID, 0) }) - // Force GC Scan on store 0 in order to fully remove range. - mtc.stores[1].MustForceReplicaGCScanAndProcess() + // Wait until we're sure that store 0 has successfully processed its removal. + require.NoError(t, mtc.waitForUnreplicated(replica.RangeID, 0)) + mtc.waitForValues(roachpb.Key("z"), []int64{0, 5, 5}) // Verify range count is as expected. diff --git a/pkg/storage/client_raft_helpers_test.go b/pkg/storage/client_raft_helpers_test.go new file mode 100644 index 000000000000..0d9bf9cd4945 --- /dev/null +++ b/pkg/storage/client_raft_helpers_test.go @@ -0,0 +1,291 @@ +// Copyright 2019 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. 
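This new helper file factors the unreliableRaftHandler out of client_merge_test.go and bundles its filter hooks into dropFuncs. A minimal usage sketch, assuming the multiTestContext harness used throughout these tests (the range ID and store index below are illustrative):

// Drop all MsgApp traffic for the range as it arrives at store 2, while
// letting heartbeats and responses through, mirroring the merge/raft tests.
mtc.transport.Listen(mtc.stores[2].Ident.StoreID, &unreliableRaftHandler{
	rangeID:            7, // hypothetical range under test
	RaftMessageHandler: mtc.stores[2],
	dropFuncs: dropFuncs{
		dropReq: func(req *storage.RaftMessageRequest) bool {
			return req.Message.Type == raftpb.MsgApp
		},
		dropHB:   func(*storage.RaftHeartbeat) bool { return false },
		dropResp: func(*storage.RaftMessageResponse) bool { return false },
	},
})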
+ +package storage_test + +import ( + "context" + "errors" + + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/storage" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/syncutil" + "go.etcd.io/etcd/raft" +) + +type dropFuncs struct { + // If non-nil, can return false to avoid dropping a msg to rangeID + dropReq func(*storage.RaftMessageRequest) bool + dropHB func(*storage.RaftHeartbeat) bool + dropResp func(*storage.RaftMessageResponse) bool + dropSnap func(*storage.SnapshotRequest_Header) error +} + +// unreliableRaftHandler drops all Raft messages that are addressed to the +// specified rangeID, but lets all other messages through. +type unreliableRaftHandler struct { + rangeID roachpb.RangeID + storage.RaftMessageHandler + dropFuncs +} + +func (h *unreliableRaftHandler) HandleRaftRequest( + ctx context.Context, + req *storage.RaftMessageRequest, + respStream storage.RaftMessageResponseStream, +) *roachpb.Error { + if len(req.Heartbeats)+len(req.HeartbeatResps) > 0 { + reqCpy := *req + req = &reqCpy + req.Heartbeats = h.filterHeartbeats(req.Heartbeats) + req.HeartbeatResps = h.filterHeartbeats(req.HeartbeatResps) + if len(req.Heartbeats)+len(req.HeartbeatResps) == 0 { + // Entirely filtered. + return nil + } + } else if req.RangeID == h.rangeID { + if h.dropReq == nil || h.dropReq(req) { + log.Infof( + ctx, + "dropping r%d Raft message %s", + req.RangeID, + raft.DescribeMessage(req.Message, func([]byte) string { + return "" + }), + ) + + return nil + } + } + return h.RaftMessageHandler.HandleRaftRequest(ctx, req, respStream) +} + +func (h *unreliableRaftHandler) filterHeartbeats( + hbs []storage.RaftHeartbeat, +) []storage.RaftHeartbeat { + if len(hbs) == 0 { + return hbs + } + var cpy []storage.RaftHeartbeat + for i := range hbs { + hb := &hbs[i] + if hb.RangeID != h.rangeID || (h.dropHB != nil && !h.dropHB(hb)) { + cpy = append(cpy, *hb) + } + } + return cpy +} + +func (h *unreliableRaftHandler) HandleRaftResponse( + ctx context.Context, resp *storage.RaftMessageResponse, +) error { + if resp.RangeID == h.rangeID { + if h.dropResp == nil || h.dropResp(resp) { + return nil + } + } + return h.RaftMessageHandler.HandleRaftResponse(ctx, resp) +} + +func (h *unreliableRaftHandler) HandleSnapshot( + header *storage.SnapshotRequest_Header, respStream storage.SnapshotResponseStream, +) error { + if header.RaftMessageRequest.RangeID == h.rangeID && h.dropSnap != nil { + if err := h.dropSnap(header); err != nil { + return err + } + } + return h.RaftMessageHandler.HandleSnapshot(header, respStream) +} + +// mtcStoreRaftMessageHandler exists to allows a store to be stopped and +// restarted while maintaining a partition using an unreliableRaftHandler. 
+type mtcStoreRaftMessageHandler struct { + mtc *multiTestContext + storeIdx int +} + +func (h *mtcStoreRaftMessageHandler) HandleRaftRequest( + ctx context.Context, + req *storage.RaftMessageRequest, + respStream storage.RaftMessageResponseStream, +) *roachpb.Error { + return h.mtc.Store(h.storeIdx).HandleRaftRequest(ctx, req, respStream) +} + +func (h *mtcStoreRaftMessageHandler) HandleRaftResponse( + ctx context.Context, resp *storage.RaftMessageResponse, +) error { + return h.mtc.Store(h.storeIdx).HandleRaftResponse(ctx, resp) +} + +func (h *mtcStoreRaftMessageHandler) HandleSnapshot( + header *storage.SnapshotRequest_Header, respStream storage.SnapshotResponseStream, +) error { + return h.mtc.Store(h.storeIdx).HandleSnapshot(header, respStream) +} + +// mtcPartitionedRange is a convenient abstraction to create a range on a node +// in a multiTestContext which can be partitioned and unpartitioned. +type mtcPartitionedRange struct { + rangeID roachpb.RangeID + mu struct { + syncutil.RWMutex + partitionedNode int + partitioned bool + partitionedReplicas map[roachpb.ReplicaID]bool + } + handlers []storage.RaftMessageHandler +} + +// setupPartitionedRange sets up an mtcPartitionedRange for the provided mtc, +// rangeID, and node index in the mtc. The range is initially not partitioned. +// +// We're going to set up the cluster with partitioning so that we can +// partition node p from the others. We do this by installing +// unreliableRaftHandler listeners on all three Stores which we can enable +// and disable with an atomic. The handler on the partitioned store filters +// out all messages while the handler on the other two stores only filters +// out messages from the partitioned store. When activated the configuration +// looks like: +// +// [p] +// x x +// / \ +// x x +// [*]<---->[*] +// +// The activated argument controls whether the partition is activated when this +// function returns. +// +// If replicaID is zero then it is resolved by looking up the replica for the +// partitionedNode of from the current range descriptor of rangeID. 
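The tests later in this patch drive the helper roughly as follows (a sketch assuming the multiTestContext harness; the range and node index are illustrative):

// Partition node 0 for the range; the partition starts out inactive.
partRange, err := setupPartitionedRange(mtc, rangeID,
	0 /* replicaID: resolved from the descriptor */, 0 /* partitionedNode */,
	false /* activated */, dropFuncs{})
if err != nil {
	t.Fatal(err)
}
partRange.activate()   // node 0 stops exchanging raft traffic with the others
// ... exercise the range while node 0 is cut off ...
partRange.deactivate() // heal the partition so node 0 can catch up or learn it was removed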
+func setupPartitionedRange( + mtc *multiTestContext, + rangeID roachpb.RangeID, + replicaID roachpb.ReplicaID, + partitionedNode int, + activated bool, + funcs dropFuncs, +) (*mtcPartitionedRange, error) { + handlers := make([]storage.RaftMessageHandler, 0, len(mtc.stores)) + for i := range mtc.stores { + handlers = append(handlers, &mtcStoreRaftMessageHandler{ + mtc: mtc, + storeIdx: i, + }) + } + return setupPartitionedRangeWithHandlers(mtc, rangeID, replicaID, partitionedNode, activated, handlers, funcs) +} + +func setupPartitionedRangeWithHandlers( + mtc *multiTestContext, + rangeID roachpb.RangeID, + replicaID roachpb.ReplicaID, + partitionedNode int, + activated bool, + handlers []storage.RaftMessageHandler, + funcs dropFuncs, +) (*mtcPartitionedRange, error) { + pr := &mtcPartitionedRange{ + rangeID: rangeID, + handlers: make([]storage.RaftMessageHandler, 0, len(handlers)), + } + pr.mu.partitioned = activated + pr.mu.partitionedNode = partitionedNode + if replicaID == 0 { + partRepl, err := mtc.Store(partitionedNode).GetReplica(rangeID) + if err != nil { + return nil, err + } + partReplDesc, err := partRepl.GetReplicaDescriptor() + if err != nil { + return nil, err + } + replicaID = partReplDesc.ReplicaID + } + pr.mu.partitionedReplicas = map[roachpb.ReplicaID]bool{ + replicaID: true, + } + for i := range mtc.stores { + s := i + h := &unreliableRaftHandler{ + rangeID: rangeID, + RaftMessageHandler: handlers[s], + dropFuncs: funcs, + } + // Only filter messages from the partitioned store on the other + // two stores. + if h.dropReq == nil { + h.dropReq = func(req *storage.RaftMessageRequest) bool { + pr.mu.RLock() + defer pr.mu.RUnlock() + return pr.mu.partitioned && + (s == pr.mu.partitionedNode || + req.FromReplica.StoreID == roachpb.StoreID(pr.mu.partitionedNode)+1) + } + } + if h.dropHB == nil { + h.dropHB = func(hb *storage.RaftHeartbeat) bool { + pr.mu.RLock() + defer pr.mu.RUnlock() + if !pr.mu.partitioned { + return false + } + if s == partitionedNode { + return true + } + return pr.mu.partitionedReplicas[hb.FromReplicaID] + } + } + if h.dropSnap == nil { + h.dropSnap = func(header *storage.SnapshotRequest_Header) error { + pr.mu.RLock() + defer pr.mu.RUnlock() + if !pr.mu.partitioned { + return nil + } + if pr.mu.partitionedReplicas[header.RaftMessageRequest.ToReplica.ReplicaID] { + return errors.New("partitioned") + } + return nil + } + } + pr.handlers = append(pr.handlers, h) + mtc.transport.Listen(mtc.stores[s].Ident.StoreID, h) + } + return pr, nil +} + +func (pr *mtcPartitionedRange) deactivate() { pr.set(false) } +func (pr *mtcPartitionedRange) activate() { pr.set(true) } +func (pr *mtcPartitionedRange) set(active bool) { + pr.mu.Lock() + defer pr.mu.Unlock() + pr.mu.partitioned = active +} + +func (pr *mtcPartitionedRange) addReplica(replicaID roachpb.ReplicaID) { + pr.mu.Lock() + defer pr.mu.Unlock() + pr.mu.partitionedReplicas[replicaID] = true +} + +func (pr *mtcPartitionedRange) extend( + mtc *multiTestContext, + rangeID roachpb.RangeID, + replicaID roachpb.ReplicaID, + partitionedNode int, + activated bool, + funcs dropFuncs, +) (*mtcPartitionedRange, error) { + return setupPartitionedRangeWithHandlers(mtc, rangeID, replicaID, partitionedNode, activated, pr.handlers, funcs) +} diff --git a/pkg/storage/client_raft_test.go b/pkg/storage/client_raft_test.go index 12335fcc96ea..0c510970ae52 100644 --- a/pkg/storage/client_raft_test.go +++ b/pkg/storage/client_raft_test.go @@ -33,6 +33,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/settings/cluster" 
"github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/storage/engine" + "github.com/cockroachdb/cockroach/pkg/storage/stateloader" "github.com/cockroachdb/cockroach/pkg/storage/storagebase" "github.com/cockroachdb/cockroach/pkg/storage/storagepb" "github.com/cockroachdb/cockroach/pkg/testutils" @@ -973,16 +974,18 @@ func TestSnapshotAfterTruncationWithUncommittedTail(t *testing.T) { mtc.transport.Listen(mtc.stores[s].Ident.StoreID, &unreliableRaftHandler{ rangeID: 1, RaftMessageHandler: mtc.stores[s], - dropReq: func(req *storage.RaftMessageRequest) bool { - // Make sure that even going forward no MsgApp for what we just truncated can - // make it through. The Raft transport is asynchronous so this is necessary - // to make the test pass reliably. - // NB: the Index on the message is the log index that _precedes_ any of the - // entries in the MsgApp, so filter where msg.Index < index, not <= index. - return req.Message.Type == raftpb.MsgApp && req.Message.Index < index + dropFuncs: dropFuncs{ + dropReq: func(req *storage.RaftMessageRequest) bool { + // Make sure that even going forward no MsgApp for what we just truncated can + // make it through. The Raft transport is asynchronous so this is necessary + // to make the test pass reliably. + // NB: the Index on the message is the log index that _precedes_ any of the + // entries in the MsgApp, so filter where msg.Index < index, not <= index. + return req.Message.Type == raftpb.MsgApp && req.Message.Index < index + }, + dropHB: func(*storage.RaftHeartbeat) bool { return false }, + dropResp: func(*storage.RaftMessageResponse) bool { return false }, }, - dropHB: func(*storage.RaftHeartbeat) bool { return false }, - dropResp: func(*storage.RaftMessageResponse) bool { return false }, }) } @@ -1192,15 +1195,12 @@ func TestReplicateAfterRemoveAndSplit(t *testing.T) { return err } - if err := replicateRHS(); !testutils.IsError(err, storage.IntersectingSnapshotMsg) { - t.Fatalf("unexpected error %v", err) - } - - // Enable the replica GC queue so that the next attempt to replicate the RHS - // to store 2 will cause the obsolete replica to be GC'd allowing a - // subsequent replication to succeed. - mtc.stores[2].SetReplicaGCQueueActive(true) - + // This used to fail with IntersectingSnapshotMsg because we relied on replica + // GC to remove the LHS and that queue is disabled. Now we will detect that + // the LHS is not part of the range because of a ReplicaTooOldError and then + // we'll replicaGC the LHS in response. + // TODO(ajwerner): filter the reponses to node 2 or disable this eager + // replicaGC. testutils.SucceedsSoon(t, replicateRHS) } @@ -1883,6 +1883,7 @@ func testReplicaAddRemove(t *testing.T, addFirst bool) { // replica GC queue does its work, so we disable the replica gc queue here // and run it manually when we're ready. sc.TestingKnobs.DisableReplicaGCQueue = true + sc.TestingKnobs.DisableEagerReplicaRemoval = true mtc := &multiTestContext{ storeConfig: &sc, // This test was written before the multiTestContext started creating many @@ -3000,6 +3001,10 @@ func TestReplicateRogueRemovedNode(t *testing.T) { defer mtc.Stop() mtc.Start(t, 3) + // We're going to set up the cluster with partitioning so that we can + // partition node 0 from the others. The partition is not initially active. + partRange, err := setupPartitionedRange(mtc, 1, 0, 0, false /* activated */, dropFuncs{}) + require.NoError(t, err) // First put the range on all three nodes. 
raftID := roachpb.RangeID(1) mtc.replicateRange(raftID, 1, 2) @@ -3044,7 +3049,9 @@ func TestReplicateRogueRemovedNode(t *testing.T) { } return nil }) - + // Partition nodes 1 and 2 from node 0. Otherwise they'd get a + // ReplicaTooOldError from node 0 and proceed to remove themselves. + partRange.activate() // Bring node 2 back up. mtc.restartStore(2) @@ -3547,6 +3554,7 @@ func TestRemoveRangeWithoutGC(t *testing.T) { sc := storage.TestStoreConfig(nil) sc.TestingKnobs.DisableReplicaGCQueue = true + sc.TestingKnobs.DisableEagerReplicaRemoval = true mtc := &multiTestContext{storeConfig: &sc} defer mtc.Stop() mtc.Start(t, 2) @@ -3597,12 +3605,21 @@ func TestRemoveRangeWithoutGC(t *testing.T) { mtc.advanceClock(context.TODO()) mtc.manualClock.Increment(int64(storage.ReplicaGCQueueInactivityThreshold + 1)) mtc.stores[0].SetReplicaGCQueueActive(true) - mtc.stores[0].MustForceReplicaGCScanAndProcess() + // There's a fun flake where between when the queue detects that this replica + // needs to be removed and when it actually gets processed whereby an older + // replica will send this replica a raft message which will give it an ID. + // When our replica ID changes the queue will ignore the previous addition and + // we won't be removed. + testutils.SucceedsSoon(t, func() error { + mtc.stores[0].MustForceReplicaGCScanAndProcess() - // The Replica object should be removed. - if _, err := mtc.stores[0].GetReplica(rangeID); !testutils.IsError(err, "r[0-9]+ was not found") { - t.Fatalf("expected replica to be missing; got %v", err) - } + // The Replica object should be removed. + const msg = "r[0-9]+ was not found" + if _, err := mtc.stores[0].GetReplica(rangeID); !testutils.IsError(err, msg) { + return errors.Errorf("expected %s, got %v", msg, err) + } + return nil + }) // And the data should no longer be on disk. if ok, err := engine.MVCCGetProto(context.Background(), mtc.stores[0].Engine(), descKey, @@ -4327,8 +4344,10 @@ func TestTracingDoesNotRaceWithCancelation(t *testing.T) { mtc.transport.Listen(mtc.stores[i].Ident.StoreID, &unreliableRaftHandler{ rangeID: ri.Desc.RangeID, RaftMessageHandler: mtc.stores[i], - dropReq: func(req *storage.RaftMessageRequest) bool { - return rand.Intn(2) == 0 + dropFuncs: dropFuncs{ + dropReq: func(req *storage.RaftMessageRequest) bool { + return rand.Intn(2) == 0 + }, }, }) } @@ -4595,3 +4614,465 @@ func TestAckWriteBeforeApplication(t *testing.T) { }) } } + +// TestProcessSplitAfterRightHandSideHasBeenRemoved tests cases where we have +// a follower replica which has received information about the RHS of a split +// before it has processed that split. The replica can't both have an +// initialized RHS and process the split but it can have (1) an uninitialized +// RHS with a higher replica ID than lives in the split and (2) a RHS with +// an unknown replica ID and a tombstone at exactly the replica ID of the RHS. +// It may learn about a newer replica ID than the split without ever hearing +// about the split replica. If it does not crash (3) it will know that the +// split replica is too old and will not initialize it. If the node does +// crash (4) it will forget it had learned about the higher replica ID and +// will initialize the RHS as the split replica. +// +// Starting in 19.2 if a replica discovers from a raft message that it is an +// old replica then it knows that it has been removed and re-added to the range. +// In this case the Replica eagerly destroys itself and its data. 
+// +// Given this behavior there are 4 troubling cases with regards to splits. +// +// * In all cases we begin with s1 processing a presplit snapshot for +// r20. After the split the store should have r21/3. +// +// In the first two cases the following occurs: +// +// * s1 receives a message for r21/3 prior to acquiring the split lock +// in r21. This will create an uninitialized r21/3 which may write +// HardState. +// +// * Before the r20 processes the split r21 is removed and re-added to +// s1 as r21/4. s1 receives a raft message destined for r21/4 and proceeds +// to destroy its uninitialized r21/3, laying down a tombstone at 4 in the +// process. +// +// (1) s1 processes the split and finds the RHS to be an uninitialized replica +// with a higher replica ID. +// +// (2) s1 crashes before processing the split, forgetting the replica ID of the +// RHS but retaining its tombstone. +// +// In both cases we know that the RHS could not have committed anything because +// it cannot have gotten a snapshot but we want to be sure to not synthesize a +// HardState for the RHS that contains a non-zero commit index if we know that +// the RHS will need another snapshot later. +// +// In the third case: +// +// * s1 never receives a message for r21/3. +// +// * Before the r20 processes the split r21 is removed and re-added to +// s1 as r21/4. s1 receives a raft message destined for r21/4 and has never +// heard about r21/3. +// +// (3) s1 processes the split and finds the RHS to be an uninitialized replica +// with a higher replica ID (but without a tombstone). This case is very +// similar to (1) +// +// (4) s1 crashes still before processing the split, forgetting that it had +// known about r21/4. When it reboots r21/4 is totally partitioned and +// r20 becomes unpartitioned. +// +// * r20 processes the split successfully and initialized r21/3. +// +// In the 3rd case we find that until we unpartition r21/4 =, there will be +// a CommitIndex at 10 for initialized replica r21/3, the initial value. After +// r21/4 becomes unpartitioned it will learn it is removed by catching up on +// its log or receiving a ReplicaTooOldError and will write a tombstone. +// +func TestProcessSplitAfterRightHandSideHasBeenRemoved(t *testing.T) { + defer leaktest.AfterTest(t)() + sc := storage.TestStoreConfig(nil) + // Newly-started stores (including the "rogue" one) should not GC + // their replicas. We'll turn this back on when needed. + sc.TestingKnobs.DisableReplicaGCQueue = true + sc.RaftDelaySplitToSuppressSnapshotTicks = 0 + // Make the tick interval short so we don't need to wait too long for the + // partitioned leader to time out. Also make the + // RangeLeaseRaftElectionTimeout multiplier high so that system ranges + // like node liveness can actually get leases. 
+ sc.RaftTickInterval = 10 * time.Millisecond + sc.RangeLeaseRaftElectionTimeoutMultiplier = 1000 + noopProposalFilter := storagebase.ReplicaProposalFilter(func(args storagebase.ProposalFilterArgs) *roachpb.Error { + return nil + }) + var proposalFilter atomic.Value + proposalFilter.Store(noopProposalFilter) + sc.TestingKnobs.TestingProposalFilter = func(args storagebase.ProposalFilterArgs) *roachpb.Error { + return proposalFilter.Load().(storagebase.ReplicaProposalFilter)(args) + } + + ctx := context.Background() + increment := func(t *testing.T, db *client.DB, key roachpb.Key, by int64) { + b := &client.Batch{} + b.AddRawRequest(incrementArgs(key, by)) + require.NoError(t, db.Run(ctx, b)) + } + changeReplicas := func( + t *testing.T, db *client.DB, typ roachpb.ReplicaChangeType, key roachpb.Key, idx int, + ) error { + ri, err := getRangeInfo(ctx, db, key) + require.NoError(t, err) + _, err = db.AdminChangeReplicas(ctx, ri.Desc.StartKey.AsRawKey(), ri.Desc, + roachpb.MakeReplicationChanges(typ, makeReplicationTargets(idx+1)...)) + return err + } + split := func(t *testing.T, db *client.DB, key roachpb.Key) { + b := &client.Batch{} + b.AddRawRequest(adminSplitArgs(key)) + require.NoError(t, db.Run(ctx, b)) + } + ensureNoTombstone := func(t *testing.T, store *storage.Store, rangeID roachpb.RangeID) { + var tombstone roachpb.RaftTombstone + tombstoneKey := keys.RaftTombstoneKey(rangeID) + ok, err := engine.MVCCGetProto( + ctx, store.Engine(), tombstoneKey, hlc.Timestamp{}, &tombstone, engine.MVCCGetOptions{}, + ) + require.NoError(t, err) + require.False(t, ok) + } + getHardState := func( + t *testing.T, store *storage.Store, rangeID roachpb.RangeID, + ) raftpb.HardState { + hs, err := stateloader.Make(rangeID).LoadHardState(ctx, store.Engine()) + require.NoError(t, err) + return hs + } + partitionReplicaOnSplit := func(t *testing.T, mtc *multiTestContext, key roachpb.Key, basePartition *mtcPartitionedRange, partRange **mtcPartitionedRange) { + // Set up a hook to partition the RHS range at its initial range ID + // before proposing the split trigger. + var setupOnce sync.Once + f := storagebase.ReplicaProposalFilter(func(args storagebase.ProposalFilterArgs) *roachpb.Error { + req, ok := args.Req.GetArg(roachpb.EndTransaction) + if !ok { + return nil + } + endTxn := req.(*roachpb.EndTransactionRequest) + if endTxn.InternalCommitTrigger == nil || endTxn.InternalCommitTrigger.SplitTrigger == nil { + return nil + } + split := endTxn.InternalCommitTrigger.SplitTrigger + + if !split.RightDesc.StartKey.Equal(key) { + return nil + } + setupOnce.Do(func() { + replDesc, ok := split.RightDesc.GetReplicaDescriptor(1) + require.True(t, ok) + var err error + *partRange, err = basePartition.extend(mtc, split.RightDesc.RangeID, replDesc.ReplicaID, + 0 /* partitionedNode */, true /* activated */, dropFuncs{}) + require.NoError(t, err) + proposalFilter.Store(noopProposalFilter) + }) + return nil + }) + proposalFilter.Store(f) + } + + // The basic setup for all of these tests are that we have a LHS range on 3 + // nodes and we've partitioned store 0 for the LHS range. The tests will now + // perform a split, remove the RHS, add it back and validate assumptions. + // + // Different outcomes will occur depending on whether and how the RHS is + // partitioned and whether the server is killed. In all cases we want the + // split to succeed and the RHS to eventually also be on all 3 nodes. 
+ setup := func(t *testing.T) ( + mtc *multiTestContext, + db *client.DB, + keyA, keyB roachpb.Key, + lhsID roachpb.RangeID, + lhsPartition *mtcPartitionedRange, + ) { + mtc = &multiTestContext{ + storeConfig: &sc, + } + mtc.Start(t, 3) + + db = mtc.Store(1).DB() + + // Split off a non-system range so we don't have to account for node liveness + // traffic. + scratchTableKey := keys.MakeTablePrefix(math.MaxUint32) + // Put some data in the range so we'll have something to test for. + keyA = append(append(roachpb.Key{}, scratchTableKey...), 'a') + keyB = append(append(roachpb.Key{}, scratchTableKey...), 'b') + + split(t, db, scratchTableKey) + ri, err := getRangeInfo(ctx, db, scratchTableKey) + require.Nil(t, err) + lhsID = ri.Desc.RangeID + // First put the range on all three nodes. + mtc.replicateRange(lhsID, 1, 2) + + // Set up a partition for the LHS range only. Initially it is not active. + lhsPartition, err = setupPartitionedRange(mtc, lhsID, + 0 /* replicaID */, 0 /* partitionedNode */, false /* activated */, dropFuncs{}) + require.NoError(t, err) + // Wait for all nodes to catch up. + increment(t, db, keyA, 5) + mtc.waitForValues(keyA, []int64{5, 5, 5}) + + // Transfer the lease off of node 0. + mtc.transferLease(ctx, lhsID, 0, 2) + + // Make sure everybody knows about that transfer. + increment(t, db, keyA, 1) + mtc.waitForValues(keyA, []int64{6, 6, 6}) + lhsPartition.activate() + + increment(t, db, keyA, 1) + mtc.waitForValues(keyA, []int64{6, 7, 7}) + return mtc, db, keyA, keyB, lhsID, lhsPartition + } + + // In this case we only have the LHS partitioned. The RHS will learn about its + // identity as the replica in the split and after being re-added will learn + // about the new replica ID and will lay down a tombstone. At this point we'll + // partition the RHS and ensure that the split does not clobber the RHS's hard + // state. + t.Run("no RHS partition", func(t *testing.T) { + mtc, db, keyA, keyB, _, lhsPartition := setup(t) + defer mtc.Stop() + + split(t, db, keyB) + + // Write a value which we can observe to know when the split has been + // applied by the LHS. + increment(t, db, keyA, 1) + mtc.waitForValues(keyA, []int64{6, 8, 8}) + + increment(t, db, keyB, 6) + // Wait for all non-partitioned nodes to catch up. + mtc.waitForValues(keyB, []int64{0, 6, 6}) + + rhsInfo, err := getRangeInfo(ctx, db, keyB) + require.NoError(t, err) + rhsID := rhsInfo.Desc.RangeID + _, store0Exists := rhsInfo.Desc.GetReplicaDescriptor(1) + require.True(t, store0Exists) + + // Remove and re-add the RHS to create a new uninitialized replica at + // a higher replica ID. This will lead to a tombstone being written. + require.NoError(t, changeReplicas(t, db, roachpb.REMOVE_REPLICA, keyB, 0)) + // Unsuccessfuly because the RHS will not accept the learner snapshot + // and will be rolled back. Nevertheless it will have learned that it + // has been removed at the old replica ID. + err = changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0) + require.True(t, + testutils.IsError(err, "snapshot failed.*cannot apply snapshot: snapshot intersects"), err) + + // Without a partitioned RHS we'll end up always writing a tombstone here because + // the RHS will be created at the initial replica ID because it will get + // raft message when the other nodes split and then after the above call + // it will find out about its new replica ID and write a tombstone for the + // old one. 
+ waitForTombstone(t, mtc.Store(0).Engine(), rhsID) + lhsPartition.deactivate() + mtc.waitForValues(keyA, []int64{8, 8, 8}) + hs := getHardState(t, mtc.Store(0), rhsID) + require.Equal(t, uint64(0), hs.Commit) + testutils.SucceedsSoon(t, func() error { + return changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0) + }) + mtc.waitForValues(keyB, []int64{6, 6, 6}) + }) + + // This case is like the previous case except the store crashes after + // laying down a tombstone. + t.Run("no RHS partition, with restart", func(t *testing.T) { + mtc, db, keyA, keyB, _, lhsPartition := setup(t) + defer mtc.Stop() + + split(t, db, keyB) + + // Write a value which we can observe to know when the split has been + // applied by the LHS. + increment(t, db, keyA, 1) + mtc.waitForValues(keyA, []int64{6, 8, 8}) + + increment(t, db, keyB, 6) + // Wait for all non-partitioned nodes to catch up. + mtc.waitForValues(keyB, []int64{0, 6, 6}) + + rhsInfo, err := getRangeInfo(ctx, db, keyB) + require.NoError(t, err) + rhsID := rhsInfo.Desc.RangeID + _, store0Exists := rhsInfo.Desc.GetReplicaDescriptor(1) + require.True(t, store0Exists) + + // Remove and re-add the RHS to create a new uninitialized replica at + // a higher replica ID. This will lead to a tombstone being written. + require.NoError(t, changeReplicas(t, db, roachpb.REMOVE_REPLICA, keyB, 0)) + // Unsuccessfuly because the RHS will not accept the learner snapshot + // and will be rolled back. Nevertheless it will have learned that it + // has been removed at the old replica ID. + err = changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0) + require.True(t, + testutils.IsError(err, "snapshot failed.*cannot apply snapshot: snapshot intersects"), err) + + // Without a partitioned RHS we'll end up always writing a tombstone here because + // the RHS will be created at the initial replica ID because it will get + // raft message when the other nodes split and then after the above call + // it will find out about its new replica ID and write a tombstone for the + // old one. + waitForTombstone(t, mtc.Store(0).Engine(), rhsID) + + // We do all of this incrementing to ensure that nobody will ever + // succeed in sending a message the new RHS replica after we restart + // the store. Previously there were races which could happen if we + // stopped the store immediately. Sleeps worked but this feels somehow + // more principled. + curB := int64(6) + for curB < 100 { + curB++ + increment(t, db, keyB, 1) + mtc.waitForValues(keyB, []int64{0, curB, curB}) + } + + // Restart store 0 so that it forgets about the newer replicaID. + mtc.stopStore(0) + mtc.restartStore(0) + + lhsPartition.deactivate() + mtc.waitForValues(keyA, []int64{8, 8, 8}) + hs := getHardState(t, mtc.Store(0), rhsID) + require.Equal(t, uint64(0), hs.Commit) + testutils.SucceedsSoon(t, func() error { + return changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0) + }) + mtc.waitForValues(keyB, []int64{curB, curB, curB}) + }) + + // In this case the RHS will be partitioned from hearing anything about + // the initial replica ID of the RHS after the split. It will learn about + // the higher replica ID and have that higher replica ID in memory when + // the split is processed. We partition the RHS's new replica ID before + // processing the split to ensure that the RHS doesn't get initialized. 
+ t.Run("initial replica RHS partition, no restart", func(t *testing.T) { + mtc, db, keyA, keyB, _, lhsPartition := setup(t) + defer mtc.Stop() + var rhsPartition *mtcPartitionedRange + partitionReplicaOnSplit(t, mtc, keyB, lhsPartition, &rhsPartition) + split(t, db, keyB) + + // Write a value which we can observe to know when the split has been + // applied by the LHS. + increment(t, db, keyA, 1) + mtc.waitForValues(keyA, []int64{6, 8, 8}) + + increment(t, db, keyB, 6) + // Wait for all non-partitioned nodes to catch up. + mtc.waitForValues(keyB, []int64{0, 6, 6}) + + rhsInfo, err := getRangeInfo(ctx, db, keyB) + require.NoError(t, err) + rhsID := rhsInfo.Desc.RangeID + _, store0Exists := rhsInfo.Desc.GetReplicaDescriptor(1) + require.True(t, store0Exists) + + // Remove and re-add the RHS to create a new uninitialized replica at + // a higher replica ID. This will lead to a tombstone being written. + require.NoError(t, changeReplicas(t, db, roachpb.REMOVE_REPLICA, keyB, 0)) + // Unsuccessfuly because the RHS will not accept the learner snapshot + // and will be rolled back. Nevertheless it will have learned that it + // has been removed at the old replica ID. + err = changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0) + require.True(t, + testutils.IsError(err, "snapshot failed.*cannot apply snapshot: snapshot intersects"), err) + rhsPartition.addReplica(rhsInfo.Desc.NextReplicaID) + + // Ensure that there's no tombstone. + // The RHS on store 0 never should have heard about its original ID. + ensureNoTombstone(t, mtc.Store(0), rhsID) + lhsPartition.deactivate() + mtc.waitForValues(keyA, []int64{8, 8, 8}) + hs := getHardState(t, mtc.Store(0), rhsID) + require.Equal(t, uint64(0), hs.Commit) + // Now succeed in adding the RHS. + testutils.SucceedsSoon(t, func() error { + return changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0) + }) + mtc.waitForValues(keyB, []int64{6, 6, 6}) + }) + + // This case is set up like the previous one except after the RHS learns about + // its higher replica ID the store crahes and forgets. The RHS replica gets + // initialized by the split. + t.Run("initial replica RHS partition, with restart", func(t *testing.T) { + mtc, db, keyA, keyB, _, lhsPartition := setup(t) + defer mtc.Stop() + var rhsPartition *mtcPartitionedRange + + partitionReplicaOnSplit(t, mtc, keyB, lhsPartition, &rhsPartition) + split(t, db, keyB) + + // Write a value which we can observe to know when the split has been + // applied by the LHS. + increment(t, db, keyA, 1) + mtc.waitForValues(keyA, []int64{6, 8, 8}) + + increment(t, db, keyB, 6) + // Wait for all non-partitioned nodes to catch up. + mtc.waitForValues(keyB, []int64{0, 6, 6}) + + rhsInfo, err := getRangeInfo(ctx, db, keyB) + require.NoError(t, err) + rhsID := rhsInfo.Desc.RangeID + _, store0Exists := rhsInfo.Desc.GetReplicaDescriptor(1) + require.True(t, store0Exists) + + // Remove and re-add the RHS to create a new uninitialized replica at + // a higher replica ID. This will lead to a tombstone being written. + require.NoError(t, changeReplicas(t, db, roachpb.REMOVE_REPLICA, keyB, 0)) + // Unsuccessfuly because the RHS will not accept the learner snapshot + // and will be rolled back. Nevertheless it will have learned that it + // has been removed at the old replica ID. + err = changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0) + require.True(t, + testutils.IsError(err, "snapshot failed.*cannot apply snapshot: snapshot intersects"), err) + // Ensure that there's no tombstone. 
+ // The RHS on store 0 never should have heard about its original ID. + ensureNoTombstone(t, mtc.Store(0), rhsID) + + // Now, before we deactivate the LHS partition, partition the newer replica + // on the RHS too. + rhsPartition.addReplica(rhsInfo.Desc.NextReplicaID) + + // We do all of this incrementing to ensure that nobody will ever + // succeed in sending a message the new RHS replica after we restart + // the store. Previously there were races which could happen if we + // stopped the store immediately. Sleeps worked but this feels somehow + // more principled. + curB := int64(6) + for curB < 100 { + curB++ + increment(t, db, keyB, 1) + mtc.waitForValues(keyB, []int64{0, curB, curB}) + } + + // Restart store 0 so that it forgets about the newer replicaID. + mtc.stopStore(0) + mtc.restartStore(0) + + lhsPartition.deactivate() + mtc.waitForValues(keyA, []int64{8, 8, 8}) + // In this case the store has forgotten that it knew the RHS of the split + // could not exist. We ensure that it has been initialized to the initial + // commit value, which is 10. + testutils.SucceedsSoon(t, func() error { + hs := getHardState(t, mtc.Store(0), rhsID) + if hs.Commit != uint64(10) { + return errors.Errorf("hard state not yet initialized: got %v, expected %v", + hs.Commit, uint64(10)) + } + return nil + }) + rhsPartition.deactivate() + testutils.SucceedsSoon(t, func() error { + return changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0) + }) + mtc.waitForValues(keyB, []int64{curB, curB, curB}) + }) +} diff --git a/pkg/storage/client_replica_gc_test.go b/pkg/storage/client_replica_gc_test.go index 7b8e41cf1b36..4162ffca8d85 100644 --- a/pkg/storage/client_replica_gc_test.go +++ b/pkg/storage/client_replica_gc_test.go @@ -148,8 +148,11 @@ func TestReplicaGCQueueDropReplicaDirect(t *testing.T) { // removes a range from a store that no longer should have a replica. func TestReplicaGCQueueDropReplicaGCOnScan(t *testing.T) { defer leaktest.AfterTest(t)() - mtc := &multiTestContext{} + cfg := storage.TestStoreConfig(nil) + cfg.TestingKnobs.DisableEagerReplicaRemoval = true + mtc.storeConfig = &cfg + defer mtc.Stop() mtc.Start(t, 3) // Disable the replica gc queue to prevent direct removal of replica. diff --git a/pkg/storage/client_replica_test.go b/pkg/storage/client_replica_test.go index 48d3203557c6..d2ed5d8af2ea 100644 --- a/pkg/storage/client_replica_test.go +++ b/pkg/storage/client_replica_test.go @@ -2134,6 +2134,226 @@ func TestRandomConcurrentAdminChangeReplicasRequests(t *testing.T) { } } +// TestReplicaTombstone ensures that tombstones are written when we expect +// them to be. +// +// Tombstones are laid down when replicas are removed. +// Replicas are removed for several reasons: +// +// 1) In response to a ChangeReplicasTrigger which removes it. +// 2) In response to a ReplicaTooOldError from a sent raft message. +// 3) Due to the replicaGCQueue. +// 3.1) When this detects a merge. +// 4) Due to a raft message addressed to a newer replica ID. +// 4.1) When the older replica is not initialized. +// 5) Due to a merge. +// 6) Due to snapshot which subsumes a range. +// +// This test creates all of these scenarios and ensures that tombstones are +// written. 
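Each of these scenarios is ultimately verified by reading the on-disk tombstone. The check the subtests rely on (via the waitForTombstone helper added to client_test.go in this patch) is roughly the following, where ctx, store, and rangeID come from the enclosing test:

// Read the range's RaftTombstone and inspect NextReplicaID, the minimum
// replica ID allowed to exist for this range on the store from now on.
var tombstone roachpb.RaftTombstone
tombstoneKey := keys.RaftTombstoneKey(rangeID)
ok, err := engine.MVCCGetProto(
	ctx, store.Engine(), tombstoneKey, hlc.Timestamp{}, &tombstone, engine.MVCCGetOptions{},
)
require.NoError(t, err)
require.True(t, ok, "expected a tombstone for r%d", rangeID)
// For example, the ChangeReplicasTrigger subtest below expects
// tombstone.NextReplicaID == 3, one past the removed replica's ID.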
+func TestReplicaTombstone(t *testing.T) { + defer leaktest.AfterTest(t)() + split := func(t *testing.T, db *client.DB, key roachpb.Key) { + b := &client.Batch{} + b.AddRawRequest(adminSplitArgs(key)) + require.NoError(t, db.Run(context.TODO(), b)) + } + t.Run("ChangeReplicasTrigger", func(t *testing.T) { + defer leaktest.AfterTest(t)() + + sc := storage.TestStoreConfig(nil) + // We'll control replication by hand. + sc.TestingKnobs.DisableReplicateQueue = true + // Avoid fighting with the merge queue while trying to reproduce this race. + sc.TestingKnobs.DisableMergeQueue = true + mtc := &multiTestContext{storeConfig: &sc} + defer mtc.Stop() + mtc.Start(t, 2) + + keyA := roachpb.Key("a") + + repl0 := mtc.stores[0].LookupReplica(roachpb.RKey(keyA)) + rangeID := repl0.RangeID + // Partition node 2 from sending traffic but not from receiving it. + // Replicate the range onto nodes 1 and 2. + mtc.replicateRange(rangeID, 1) + // Partition the range such that it doesn't hear responses but it hears + // everything else. + part, err := setupPartitionedRange(mtc, rangeID, 0, 1, true, dropFuncs{ + dropResp: func(*storage.RaftMessageResponse) bool { + return true + }, + dropReq: func(*storage.RaftMessageRequest) bool { + return false + }, + dropHB: func(*storage.RaftHeartbeat) bool { + return false + }, + }) + defer part.deactivate() + require.NoError(t, err) + mtc.unreplicateRange(rangeID, 1) + tombstone := waitForTombstone(t, mtc.Store(1).Engine(), rangeID) + require.Equal(t, roachpb.ReplicaID(3), tombstone.NextReplicaID) + }) + t.Run("ReplicaTooOldError", func(t *testing.T) { + defer leaktest.AfterTest(t)() + + sc := storage.TestStoreConfig(nil) + // We'll control replication by hand. + sc.TestingKnobs.DisableReplicateQueue = true + // Avoid fighting with the merge queue while trying to reproduce this race. + sc.TestingKnobs.DisableMergeQueue = true + // Send lots of heartbeats. + sc.RaftTickInterval = 10 * time.Millisecond + sc.RangeLeaseRaftElectionTimeoutMultiplier = 1000 + + mtc := &multiTestContext{storeConfig: &sc} + defer mtc.Stop() + mtc.Start(t, 3) + + keyA := roachpb.Key("a") + + repl0 := mtc.stores[0].LookupReplica(roachpb.RKey(keyA)) + rangeID := repl0.RangeID + // Partition node 2 from sending traffic but not from receiving it. + // Replicate the range onto nodes 1 and 2. + mtc.replicateRange(rangeID, 1, 2) + // Partition the range such that it hears responses but does not hear + // responses. It should destroy the local replica due to a + // ReplicaTooOldError. + sawTooOld := make(chan struct{}, 1) + part, err := setupPartitionedRange(mtc, rangeID, 0, 2, true, dropFuncs{ + dropResp: func(resp *storage.RaftMessageResponse) bool { + if resp.ToReplica.StoreID == 3 { + switch val := resp.Union.GetValue().(type) { + case *roachpb.Error: + switch val.GetDetail().(type) { + case *roachpb.ReplicaTooOldError: + select { + case sawTooOld <- struct{}{}: + default: + } + } + } + } + return false + }, + dropReq: func(req *storage.RaftMessageRequest) bool { + return req.ToReplica.StoreID == 3 + }, + dropHB: func(*storage.RaftHeartbeat) bool { + return false + }, + }) + defer part.deactivate() + require.NoError(t, err) + mtc.unreplicateRange(rangeID, 2) + // Write a value to ensure tha the remaining replicas have processed the + // removal. 
+ args := incrementArgs(keyA, 1) + _, pErr := client.SendWrappedWith(context.Background(), mtc.Store(0).TestSender(), + roachpb.Header{RangeID: rangeID}, args) + require.Nil(t, pErr) + mtc.waitForValues(keyA, []int64{1, 1, 0}) + + // Advance the clock to expire all leases. + mtc.advanceClock(context.TODO()) + + // Try to send a request to the partitioned range so that it sends raft + // messages when it attempts to campaign. + _, pErr = client.SendWrappedWith(context.Background(), mtc.Store(2).TestSender(), + roachpb.Header{RangeID: rangeID}, args) + // The replica may return NotLeaseHolderError or may have already discovered + // that it's been removed. + require.True(t, testutils.IsPError(pErr, "NotLeaseHolderError|was not found on s3"), pErr) + // Wait until we're sure that the replica has seen ReplicaTooOld. + <-sawTooOld + // We want to make sure that we actually see + tombstone := waitForTombstone(t, mtc.Store(2).Engine(), rangeID) + require.Equal(t, roachpb.ReplicaID(4), tombstone.NextReplicaID) + }) + t.Run("ReplicaGCQueue (moved)", func(t *testing.T) { + defer leaktest.AfterTest(t)() + + sc := storage.TestStoreConfig(nil) + // We'll control replication by hand. + sc.TestingKnobs.DisableReplicateQueue = true + // Avoid fighting with the merge queue while trying to reproduce this race. + sc.TestingKnobs.DisableMergeQueue = true + + mtc := &multiTestContext{storeConfig: &sc} + defer mtc.Stop() + mtc.Start(t, 3) + + keyA := roachpb.Key("a") + + repl0 := mtc.stores[0].LookupReplica(roachpb.RKey(keyA)) + rangeID := repl0.RangeID + // Partition node 2 from sending traffic but not from receiving it. + // Replicate the range onto nodes 1 and 2. + mtc.replicateRange(rangeID, 1, 2) + // Partition the range such that it doesn't hear anything. + // Use the replica GC queue to remove it. + part, err := setupPartitionedRange(mtc, rangeID, 0, 2, true, dropFuncs{}) + require.NoError(t, err) + defer part.deactivate() + mtc.unreplicateRange(rangeID, 2) + repl, err := mtc.Store(2).GetReplica(rangeID) + require.NoError(t, err) + require.NoError(t, mtc.Store(2).ManualReplicaGC(repl)) + tombstone := waitForTombstone(t, mtc.Store(2).Engine(), rangeID) + require.Equal(t, roachpb.ReplicaID(4), tombstone.NextReplicaID) + }) + // This case also detects the tombstone for nodes which processed the merge. + t.Run("ReplicaGCQueue (merged)", func(t *testing.T) { + defer leaktest.AfterTest(t)() + + sc := storage.TestStoreConfig(nil) + // We'll control replication by hand. + sc.TestingKnobs.DisableReplicateQueue = true + // Avoid fighting with the merge queue while trying to reproduce this race. + sc.TestingKnobs.DisableMergeQueue = true + + mtc := &multiTestContext{storeConfig: &sc} + defer mtc.Stop() + mtc.Start(t, 4) + + scratchTableKey := keys.MakeTablePrefix(math.MaxUint32) + keyA := append(append(roachpb.Key{}, scratchTableKey...), 'a') + keyB := append(append(roachpb.Key{}, scratchTableKey...), 'b') + split(t, mtc.Store(0).DB(), keyA) + split(t, mtc.Store(0).DB(), keyB) + + lhsRepl0 := mtc.stores[0].LookupReplica(roachpb.RKey(keyA)) + lhsID := lhsRepl0.RangeID + mtc.replicateRange(lhsID, 1, 3) + + rhsRepl0 := mtc.stores[0].LookupReplica(roachpb.RKey(keyB)) + rhsID := rhsRepl0.RangeID + mtc.replicateRange(rhsID, 1, 2) + + // Partition the range such that it doesn't hear anything. + // Use the replica GC queue to remove it. 
+ part, err := setupPartitionedRange(mtc, rhsID, 0, 2, true, dropFuncs{}) + require.NoError(t, err) + defer part.deactivate() + mtc.unreplicateRange(rhsID, 2) + mtc.replicateRange(rhsID, 3) + require.NoError(t, mtc.Store(0).DB().AdminMerge(context.TODO(), keyA)) + repl, err := mtc.Store(2).GetReplica(rhsID) + require.NoError(t, err) + require.NoError(t, mtc.Store(2).ManualReplicaGC(repl)) + tombstone := waitForTombstone(t, mtc.Store(2).Engine(), rhsID) + require.Equal(t, roachpb.ReplicaID(math.MaxInt32), tombstone.NextReplicaID) + tombstone = waitForTombstone(t, mtc.Store(3).Engine(), rhsID) + require.Equal(t, roachpb.ReplicaID(math.MaxInt32), tombstone.NextReplicaID) + }) + // TODO(ajwerner): test the subsumption case by blocking all messages after + // the merge has entered its critical phase and then wait the the LHS to + // get a snapshot. +} + // TestAdminRelocateRangeSafety exercises a situation where calls to // AdminRelocateRange can race with calls to ChangeReplicas and verifies // that such races do not leave the range in an under-replicated state. diff --git a/pkg/storage/client_split_test.go b/pkg/storage/client_split_test.go index 6db3d819b5f9..f3cb67ba9fc8 100644 --- a/pkg/storage/client_split_test.go +++ b/pkg/storage/client_split_test.go @@ -3314,8 +3314,12 @@ func TestSplitTriggerMeetsUnexpectedReplicaID(t *testing.T) { // different replicaID than the split trigger expects. add := func() { _, err := tc.AddReplicas(kRHS, tc.Target(1)) - if !testutils.IsError(err, `snapshot intersects existing range`) { - t.Fatalf(`expected snapshot intersects existing range" error got: %+v`, err) + // The "snapshot intersects existing range" error is expected if the store + // has not heard a raft message addressed to a later replica ID while the + // "was not found on" error is expected if the store has heard that it has + // a newer replica ID before receiving the snapshot. + if !testutils.IsError(err, `snapshot intersects existing range|r[0-9]+ was not found on s[0-9]+`) { + t.Fatalf(`expected snapshot intersects existing range|r[0-9]+ was not found on s[0-9]+" error got: %+v`, err) } } for i := 0; i < 5; i++ { @@ -3361,7 +3365,8 @@ func TestSplitTriggerMeetsUnexpectedReplicaID(t *testing.T) { if err != nil { return err } - if desc := repl.Desc(); !descLHS.Equal(desc) { + if desc := repl.Desc(); desc.IsInitialized() && !descLHS.Equal(desc) { + require.NoError(t, store.ManualReplicaGC(repl)) return errors.Errorf("expected %s got %s", &descLHS, desc) } return nil diff --git a/pkg/storage/client_test.go b/pkg/storage/client_test.go index 11c57695ff08..1f1a0803ca95 100644 --- a/pkg/storage/client_test.go +++ b/pkg/storage/client_test.go @@ -1061,8 +1061,8 @@ func (m *multiTestContext) restartStore(i int) { } func (m *multiTestContext) Store(i int) *storage.Store { - m.mu.Lock() - defer m.mu.Unlock() + m.mu.RLock() + defer m.mu.RUnlock() return m.stores[i] } @@ -1243,6 +1243,23 @@ func (m *multiTestContext) unreplicateRangeNonFatal(rangeID roachpb.RangeID, des return err } +// waitForUnreplicated waits until no replica exists for the specified range +// on the dest store. +func (m *multiTestContext) waitForUnreplicated(rangeID roachpb.RangeID, dest int) error { + // Wait for the unreplications to complete on destination node. 
+ return retry.ForDuration(testutils.DefaultSucceedsSoonDuration, func() error { + _, err := m.stores[dest].GetReplica(rangeID) + switch err.(type) { + case nil: + return fmt.Errorf("replica still exists on dest %d", dest) + case *roachpb.RangeNotFoundError: + return nil + default: + return err + } + }) +} + // readIntFromEngines reads the current integer value at the given key // from all configured engines, filling in zeros when the value is not // found. Returns a slice of the same length as mtc.engines. @@ -1551,3 +1568,22 @@ func verifyRecomputedStats( } return nil } + +func waitForTombstone( + t *testing.T, eng engine.Reader, rangeID roachpb.RangeID, +) (tombstone roachpb.RaftTombstone) { + testutils.SucceedsSoon(t, func() error { + tombstoneKey := keys.RaftTombstoneKey(rangeID) + ok, err := engine.MVCCGetProto( + context.TODO(), eng, tombstoneKey, hlc.Timestamp{}, &tombstone, engine.MVCCGetOptions{}, + ) + if err != nil { + t.Fatalf("failed to read tombstone: %v", err) + } + if !ok { + return fmt.Errorf("tombstone not found for range %d", rangeID) + } + return nil + }) + return tombstone +} diff --git a/pkg/storage/replica.go b/pkg/storage/replica.go index 095a00256bd9..7e61e50a1784 100644 --- a/pkg/storage/replica.go +++ b/pkg/storage/replica.go @@ -242,6 +242,7 @@ type Replica struct { syncutil.RWMutex // The destroyed status of a replica indicating if it's alive, corrupt, // scheduled for destruction or has been GCed. + // destroyStatus should only be set while also holding the raftMu. destroyStatus // Is the range quiescent? Quiescent ranges are not Tick()'d and unquiesce // whenever a Raft operation is performed. @@ -345,7 +346,8 @@ type Replica struct { replicaID roachpb.ReplicaID // The minimum allowed ID for this replica. Initialized from // RaftTombstone.NextReplicaID. - minReplicaID roachpb.ReplicaID + tombstoneReplicaID roachpb.ReplicaID + // The ID of the leader replica within the Raft group. Used to determine // when the leadership changes. leaderID roachpb.ReplicaID @@ -610,13 +612,26 @@ func (r *Replica) String() string { return fmt.Sprintf("[n%d,s%d,r%s]", r.store.Ident.NodeID, r.store.Ident.StoreID, &r.rangeStr) } -// ReplicaID returns the ID for the Replica. +// ReplicaID returns the ID for the Replica. It may be zero if the replica does +// not know its ID. Once a Replica has a non-zero ReplicaID it will never change. func (r *Replica) ReplicaID() roachpb.ReplicaID { r.mu.RLock() defer r.mu.RUnlock() return r.mu.replicaID } +// minReplicaID returns the minimum replica ID this replica could ever possibly +// have. If this replica currently knows its replica ID (i.e. ReplicaID() is +// non-zero) then it returns it. Otherwise it returns r.mu.tombstoneReplicaID. +func (r *Replica) minReplicaID() roachpb.ReplicaID { + r.mu.RLock() + defer r.mu.RUnlock() + if r.mu.replicaID != 0 { + return r.mu.replicaID + } + return r.mu.tombstoneReplicaID +} + // cleanupFailedProposal cleans up after a proposal that has failed. It // clears any references to the proposal and releases associated quota. // It requires that both Replica.mu and Replica.raftMu are exclusively held. @@ -1043,6 +1058,62 @@ func (r *Replica) requestCanProceed(rspan roachpb.RSpan, ts hlc.Timestamp) error return mismatchErr } +// isNewerThanSplit is a helper used in split(Pre|Post)Apply to +// determine whether the Replica on the right hand side of the split must +// have been removed from this store after the split. 
+//
+// TODO(ajwerner): Ideally if this store had ever learned that the replica
+// created by the split was removed it would not forget that fact.
+// There exists one edge case where the store may learn that it should house
+// a replica of the same range with a higher replica ID and then forget.
+// If the first raft message this store ever receives for this range
+// contains a replica ID higher than the replica ID in the split trigger
+// then an in-memory replica at that higher replica ID will be created and
+// no tombstone at a lower replica ID will be written. If the server then
+// crashes it will forget that it had ever been the higher replica ID. The
+// server may then proceed to process the split and initialize a replica at
+// the replica ID implied by the split. This is potentially problematic as
+// the replica may have voted as this higher replica ID and when it rediscovers
+// the higher replica ID it will delete all of the state corresponding to the
+// older replica ID including its hard state which may have been synthesized
+// with votes as the newer replica ID. This case tends to be handled safely
+// in practice because the replica should only be receiving messages as the
+// newer replica ID after it has been added to the range. Prior to learner
+// replicas we would only add a store to a range after we've successfully
+// applied a pre-emptive snapshot. If the store were to split between the
+// preemptive snapshot and the addition then the addition would fail due to
+// the conditional put logic. If the store were then to enable learners, we're
+// still okay because we won't promote a learner unless we succeed in sending
+// a learner snapshot. If we fail to send the snapshot then the replica never
+// becomes a voter, so its votes don't matter and are safe to discard.
+//
+// Despite the safety provided by the change replicas protocol explained above,
+// it'd be good to know for sure that a replica ID for a range on a store
+// is always monotonically increasing, even across restarts.
+//
+// See TestProcessSplitAfterRightHandSideHasBeenRemoved.
+func (r *Replica) isNewerThanSplit(split *roachpb.SplitTrigger) bool {
+	r.mu.RLock()
+	defer r.mu.RUnlock()
+	return r.isNewerThanSplitRLocked(split)
+}
+
+func (r *Replica) isNewerThanSplitRLocked(split *roachpb.SplitTrigger) bool {
+	rightDesc, hasRightDesc := split.RightDesc.GetReplicaDescriptor(r.StoreID())
+	// If we have written a tombstone for this range then we know that the RHS
+	// must have already been removed at the split replica ID.
+	return r.mu.tombstoneReplicaID != 0 ||
+		// If the first raft message we received for the RHS range was for a replica
+		// ID which is above the replica ID of the split then we would not have
+		// written a tombstone but we will have a replica ID that will exceed the
+		// split replica ID.
+		(r.mu.replicaID > rightDesc.ReplicaID &&
+			// If we're catching up from a preemptive snapshot we won't be in the
+			// split and we won't know whether our current replica ID indicates
+			// we've been removed.
+			hasRightDesc)
+}
+
 // checkBatchRequest verifies BatchRequest validity requirements. In particular,
 // the batch must have an assigned timestamp, and either all requests must be
 // read-only, or none.
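For reference, here is a minimal standalone sketch (hypothetical names, not code from this patch) of the predicate that isNewerThanSplitRLocked computes, with the inputs that normally live under Replica.mu made explicit:

package main

import "fmt"

// ReplicaID stands in for roachpb.ReplicaID in this sketch.
type ReplicaID int32

// isNewerThanSplitSketch returns true when the RHS replica on this store must
// have been removed after the split: either a tombstone was written for the
// RHS range, or this store appears in the split's RHS descriptor but already
// has a replica ID above the one the split trigger would assign.
func isNewerThanSplitSketch(
	tombstoneReplicaID, currentReplicaID, splitReplicaID ReplicaID, inRHSDesc bool,
) bool {
	return tombstoneReplicaID != 0 ||
		(inRHSDesc && currentReplicaID > splitReplicaID)
}

func main() {
	fmt.Println(isNewerThanSplitSketch(3, 0, 2, true))  // true: a tombstone was written for the RHS
	fmt.Println(isNewerThanSplitSketch(0, 5, 2, true))  // true: removed and re-added at a higher replica ID
	fmt.Println(isNewerThanSplitSketch(0, 0, 2, true))  // false: common case, the split proceeds normally
	fmt.Println(isNewerThanSplitSketch(0, 5, 2, false)) // false: catching up from a preemptive snapshot
}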
diff --git a/pkg/storage/replica_application_result.go b/pkg/storage/replica_application_result.go index aa3a0116a841..706a45057305 100644 --- a/pkg/storage/replica_application_result.go +++ b/pkg/storage/replica_application_result.go @@ -293,22 +293,43 @@ func (r *Replica) handleComputeChecksumResult(ctx context.Context, cc *storagepb r.computeChecksumPostApply(ctx, *cc) } -func (r *Replica) handleChangeReplicasResult(ctx context.Context, chng *storagepb.ChangeReplicas) { - storeID := r.store.StoreID() - var found bool - for _, rDesc := range chng.Replicas() { - if rDesc.StoreID == storeID { - found = true - break - } +func (r *Replica) handleChangeReplicasResult( + ctx context.Context, chng *storagepb.ChangeReplicas, +) (changeRemovedReplica bool) { + // If this command removes us then we would have set the destroy status + // to destroyReasonRemovalPending which we detect here. + // Note that a replica's destroy status is only ever updated under the + // raftMu and we validated that the replica was not RemovingOrRemoved + // before processing this raft ready. + if ds, _ := r.IsDestroyed(); ds != destroyReasonRemovalPending { + return false // changeRemovedReplica + } + + // If this command removes us then we need to go through the process of + // removing our replica from the store. After this method returns, the code + // should roughly return all the way up to whoever called handleRaftReady + // and this Replica should never be heard from again. We can detect if this + // change removed us by inspecting the replica's destroyStatus. We check the + // destroy status before processing a raft ready so if we find ourselves with + // removal pending at this point then we know that this command must be + // responsible. + if log.V(1) { + log.Infof(ctx, "removing replica due to ChangeReplicasTrigger: %v", chng) } - if !found { - // This wants to run as late as possible, maximizing the chances - // that the other nodes have finished this command as well (since - // processing the removal from the queue looks up the Range at the - // lease holder, being too early here turns this into a no-op). - r.store.replicaGCQueue.AddAsync(ctx, r, replicaGCPriorityRemoved) + + // NB: postDestroyRaftMuLocked requires that the batch which removed the data + // be durably synced to disk, which we have. + // See replicaAppBatch.ApplyToStateMachine(). + if err := r.postDestroyRaftMuLocked(ctx, r.GetMVCCStats()); err != nil { + log.Fatalf(ctx, "failed to run Replica postDestroy: %v", err) + } + + if err := r.store.removeInitializedReplicaRaftMuLocked(ctx, r, chng.Desc.NextReplicaID, RemoveOptions{ + DestroyData: false, // We already destroyed the data when the batch committed. + }); err != nil { + log.Fatalf(ctx, "failed to remove replica: %v", err) } + return true } func (r *Replica) handleRaftLogDeltaResult(ctx context.Context, delta int64) { diff --git a/pkg/storage/replica_application_state_machine.go b/pkg/storage/replica_application_state_machine.go index 83f3cfca75fc..b3f8ca578219 100644 --- a/pkg/storage/replica_application_state_machine.go +++ b/pkg/storage/replica_application_state_machine.go @@ -13,7 +13,6 @@ package storage import ( "context" "fmt" - "math" "time" "github.com/cockroachdb/cockroach/pkg/roachpb" @@ -376,6 +375,9 @@ type replicaAppBatch struct { // triggered a migration to the replica applied state key. If so, this // migration will be performed when the application batch is committed. 
 	migrateToAppliedStateKey bool
+	// changeRemovesReplica tracks whether the command in the batch (there must
+	// be only one) removes this replica from the range.
+	changeRemovesReplica bool
 
 	// Statistics.
 	entries int
@@ -515,6 +517,32 @@ func (b *replicaAppBatch) stageWriteBatch(ctx context.Context, cmd *replicatedCm
 	return nil
 }
 
+// changeRemovesStore returns true if this change removes the replica on storeID.
+func changeRemovesStore(
+	desc *roachpb.RangeDescriptor, change *storagepb.ChangeReplicas, storeID roachpb.StoreID,
+) (removesStore bool) {
+	curReplica, existsInDesc := desc.GetReplicaDescriptor(storeID)
+	// NB: if we're catching up from a preemptive snapshot then we won't
+	// exist in the current descriptor and we can't be removed.
+	if !existsInDesc {
+		return false
+	}
+
+	// NB: We don't use change.Removed() because it will include replicas being
+	// transitioned to VOTER_OUTGOING.
+
+	// In 19.1 and before we used DeprecatedUpdatedReplicas instead of providing
+	// a new range descriptor. Check first whether this is a 19.1-or-earlier
+	// command, which uses DeprecatedChangeType and DeprecatedReplica.
+	if change.Desc == nil {
+		return change.DeprecatedChangeType == roachpb.REMOVE_REPLICA && change.DeprecatedReplica.ReplicaID == curReplica.ReplicaID
+	}
+	// In 19.2 and beyond we supply the new range descriptor in the change.
+	// We know we're removed if we do not appear in the new descriptor.
+	_, existsInChange := change.Desc.GetReplicaDescriptor(storeID)
+	return !existsInChange
+}
+
 // runPreApplyTriggers runs any triggers that must fire before a command is
 // applied. It may modify the command's ReplicatedEvalResult.
 func (b *replicaAppBatch) runPreApplyTriggers(ctx context.Context, cmd *replicatedCmd) error {
@@ -554,27 +582,40 @@ func (b *replicaAppBatch) runPreApplyTriggers(ctx context.Context, cmd *replicat
 		// cannot be constructed at evaluation time because it differs
 		// on each replica (votes may have already been cast on the
 		// uninitialized replica). Write this new hardstate to the batch too.
-		// See https://github.com/cockroachdb/cockroach/issues/20629
-		splitPreApply(ctx, b.batch, res.Split.SplitTrigger)
+		// See https://github.com/cockroachdb/cockroach/issues/20629.
+		//
+		// Alternatively if we discover that the RHS has already been removed
+		// from this store, clean up its data.
+		splitPreApply(ctx, b.batch, res.Split.SplitTrigger, b.r)
 	}
 
 	if merge := res.Merge; merge != nil {
 		// Merges require the subsumed range to be atomically deleted when the
 		// merge transaction commits.
+
+		// If our range currently has a non-zero replica ID then we know we're
+		// safe to commit this merge because of the invariants provided to us
+		// by the merge protocol. Namely, if this command commits then all of
+		// the replicas in the range descriptor are collocated at the time it
+		// commits. If we do not have a non-zero replica ID then the logic in
+		// Stage should detect that and destroy our preemptive snapshot, so we
+		// shouldn't ever get here.
 		rhsRepl, err := b.r.store.GetReplica(merge.RightDesc.RangeID)
 		if err != nil {
 			return wrapWithNonDeterministicFailure(err, "unable to get replica for merge")
 		}
-		// Use math.MaxInt32 as the nextReplicaID as an extra safeguard against creating
-		// new replicas of the RHS. This isn't required for correctness, since the merge
-		// protocol should guarantee that no new replicas of the RHS can ever be
-		// created, but it doesn't hurt to be careful.
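As a rough illustration of why a math.MaxInt32 tombstone acts as a safeguard, here is a standalone sketch (hypothetical names, not the real storage package) of the tombstone check that rejects any replica ID below the tombstone's NextReplicaID:

package main

import (
	"errors"
	"fmt"
	"math"
)

type ReplicaID int32

// mergedTombstone mirrors the idea behind mergedTombstoneReplicaID: no real
// replica ID can reach MaxInt32, so a tombstone with this NextReplicaID
// rejects every future incarnation of the range on this store.
const mergedTombstone ReplicaID = math.MaxInt32

var errRaftGroupDeleted = errors.New("raft group deleted")

// checkAgainstTombstone sketches the tombstone check: creating a replica with
// an ID below the tombstone's NextReplicaID must fail.
func checkAgainstTombstone(tombstoneNextReplicaID, incomingReplicaID ReplicaID) error {
	if incomingReplicaID < tombstoneNextReplicaID {
		return errRaftGroupDeleted
	}
	return nil
}

func main() {
	// A normal removal at NextReplicaID=4 still admits a later re-addition at 5.
	fmt.Println(checkAgainstTombstone(4, 3)) // raft group deleted
	fmt.Println(checkAgainstTombstone(4, 5)) // <nil>
	// A merge tombstone admits nothing.
	fmt.Println(checkAgainstTombstone(mergedTombstone, 7)) // raft group deleted
}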
- const rangeIDLocalOnly = true + + // Use math.MaxInt32 (mergedTombstoneReplicaID) as the nextReplicaID as an + // extra safeguard against creating new replicas of the RHS. This isn't + // required for correctness, since the merge protocol should guarantee that + // no new replicas of the RHS can ever be created, but it doesn't hurt to + // be careful. + const clearRangeIDLocalOnly = true const mustClearRange = false if err := rhsRepl.preDestroyRaftMuLocked( - ctx, b.batch, b.batch, math.MaxInt32, rangeIDLocalOnly, mustClearRange, + ctx, b.batch, b.batch, mergedTombstoneReplicaID, clearRangeIDLocalOnly, mustClearRange, ); err != nil { - return wrapWithNonDeterministicFailure(err, "unable to destroy range before merge") + return wrapWithNonDeterministicFailure(err, "unable to destroy replica before merge") } } @@ -602,6 +643,44 @@ func (b *replicaAppBatch) runPreApplyTriggers(ctx context.Context, cmd *replicat } } + // Detect if this command will remove us from the range. + // If so we stage the removal of all of our range data into this batch. + // We'll complete the removal when it commits. Later logic detects the + // removal by inspecting the destroy status. + // + // NB: This is the last step in the preApply which durably writes to the + // replica state so that if it removes the replica it removes everything. + if change := res.ChangeReplicas; change != nil && + changeRemovesStore(b.state.Desc, change, b.r.store.StoreID()) { + // Delete all of the local data. We're going to delete this hard state too. + // In order for this to be safe we need code above this to promise that we're + // never going to write hard state in response to a message for a later + // replica (with a different replica ID) to this range state. + // Furthermore we mark the replica as destroyed so that new commands are not + // accepted. The replica will be destroyed in handleChangeReplicas. + // Note that we must be holding the raftMu here because we're in the + // midst of application. + + if !b.r.store.TestingKnobs().DisableEagerReplicaRemoval { + b.r.mu.Lock() + b.r.mu.destroyStatus.Set( + roachpb.NewRangeNotFoundError(b.r.RangeID, b.r.store.StoreID()), + destroyReasonRemovalPending) + b.r.mu.Unlock() + b.changeRemovesReplica = true + if err := b.r.preDestroyRaftMuLocked( + ctx, + b.batch, + b.batch, + change.Desc.NextReplicaID, + false, /* clearRangeIDLocalOnly */ + false, /* mustUseClearRange */ + ); err != nil { + return wrapWithNonDeterministicFailure(err, "unable to destroy replica before removal") + } + } + } + // Provide the command's corresponding logical operations to the Replica's // rangefeed. Only do so if the WriteBatch is non-nil, in which case the // rangefeed requires there to be a corresponding logical operation log or @@ -614,6 +693,7 @@ func (b *replicaAppBatch) runPreApplyTriggers(ctx context.Context, cmd *replicat } else if cmd.raftCmd.LogicalOpLog != nil { log.Fatalf(ctx, "non-nil logical op log with nil write batch: %v", cmd.raftCmd) } + return nil } @@ -671,17 +751,23 @@ func (b *replicaAppBatch) ApplyToStateMachine(ctx context.Context) error { r := b.r r.store.Clock().Update(b.maxTS) - // Add the replica applied state key to the write batch. - if err := b.addAppliedStateKeyToBatch(ctx); err != nil { - return err + // Add the replica applied state key to the write batch if this change + // doesn't remove us. + if !b.changeRemovesReplica { + if err := b.addAppliedStateKeyToBatch(ctx); err != nil { + return err + } } // Apply the write batch to RockDB. 
Entry application is done without // syncing to disk. The atomicity guarantees of the batch and the fact that // the applied state is stored in this batch, ensure that if the batch ends // up not being durably committed then the entries in this batch will be - // applied again upon startup. - const sync = false + // applied again upon startup. However, if we're removing the replica's data + // then we sync this batch as it is not safe to call postDestroyRaftMuLocked + // before ensuring that the replica's data has been synchronously removed. + // See handleChangeReplicasResult(). + sync := b.changeRemovesReplica if err := b.batch.Commit(sync); err != nil { return wrapWithNonDeterministicFailure(err, "unable to commit Raft entry batch") } @@ -862,7 +948,11 @@ func (sm *replicaStateMachine) ApplySideEffects( // before notifying a potentially waiting client. clearTrivialReplicatedEvalResultFields(cmd.replicatedResult()) if !cmd.IsTrivial() { - shouldAssert := sm.handleNonTrivialReplicatedEvalResult(ctx, *cmd.replicatedResult()) + shouldAssert, isRemoved := sm.handleNonTrivialReplicatedEvalResult(ctx, *cmd.replicatedResult()) + + if isRemoved { + return nil, apply.ErrRemoved + } // NB: Perform state assertion before acknowledging the client. // Some tests (TestRangeStatsInit) assumes that once the store has started // and the first range has a lease that there will not be a later hard-state. @@ -928,7 +1018,7 @@ func (sm *replicaStateMachine) ApplySideEffects( // to pass a replicatedResult that does not imply any side-effects. func (sm *replicaStateMachine) handleNonTrivialReplicatedEvalResult( ctx context.Context, rResult storagepb.ReplicatedEvalResult, -) (shouldAssert bool) { +) (shouldAssert, isRemoved bool) { // Assert that this replicatedResult implies at least one side-effect. if rResult.Equal(storagepb.ReplicatedEvalResult{}) { log.Fatalf(ctx, "zero-value ReplicatedEvalResult passed to handleNonTrivialReplicatedEvalResult") @@ -960,7 +1050,7 @@ func (sm *replicaStateMachine) handleNonTrivialReplicatedEvalResult( // we want to assert that these two states do not diverge. shouldAssert = !rResult.Equal(storagepb.ReplicatedEvalResult{}) if !shouldAssert { - return false + return false, false } if rResult.Split != nil { @@ -1000,7 +1090,7 @@ func (sm *replicaStateMachine) handleNonTrivialReplicatedEvalResult( } if rResult.ChangeReplicas != nil { - sm.r.handleChangeReplicasResult(ctx, rResult.ChangeReplicas) + isRemoved = sm.r.handleChangeReplicasResult(ctx, rResult.ChangeReplicas) rResult.ChangeReplicas = nil } @@ -1012,7 +1102,7 @@ func (sm *replicaStateMachine) handleNonTrivialReplicatedEvalResult( if !rResult.Equal(storagepb.ReplicatedEvalResult{}) { log.Fatalf(ctx, "unhandled field in ReplicatedEvalResult: %s", pretty.Diff(rResult, storagepb.ReplicatedEvalResult{})) } - return true + return true, isRemoved } func (sm *replicaStateMachine) maybeApplyConfChange(ctx context.Context, cmd *replicatedCmd) error { diff --git a/pkg/storage/replica_destroy.go b/pkg/storage/replica_destroy.go index 0fb60c307472..cd3515a5be60 100644 --- a/pkg/storage/replica_destroy.go +++ b/pkg/storage/replica_destroy.go @@ -12,6 +12,8 @@ package storage import ( "context" + "fmt" + "math" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" @@ -29,7 +31,10 @@ type DestroyReason int const ( // The replica is alive. destroyReasonAlive DestroyReason = iota - // The replica has been marked for GC, but hasn't been GCed yet. 
+ // The replica is in the process of being removed but has not been removed + // yet. It exists to avoid races between two threads which may decide to + // destroy a replica (e.g. processing a ChangeReplicasTrigger removing the + // range and receiving a raft message with a higher replica ID). destroyReasonRemovalPending // The replica has been GCed. destroyReasonRemoved @@ -43,15 +48,15 @@ type destroyStatus struct { err error } +func (s destroyStatus) String() string { + return fmt.Sprintf("{%v %d}", s.err, s.reason) +} + func (s *destroyStatus) Set(err error, reason DestroyReason) { s.err = err s.reason = reason } -func (s *destroyStatus) Reset() { - s.Set(nil, destroyReasonAlive) -} - // IsAlive returns true when a replica is alive. func (s destroyStatus) IsAlive() bool { return s.reason == destroyReasonAlive @@ -62,16 +67,30 @@ func (s destroyStatus) Removed() bool { return s.reason == destroyReasonRemoved } +// RemovingOrRemoved returns whether the replica is removed or in the process of +// being removed. +func (s destroyStatus) RemovingOrRemoved() bool { + return s.reason == destroyReasonRemovalPending || s.reason == destroyReasonRemoved +} + +// mergedTombstoneReplicaID is the replica ID written into the tombstone +// for replicas which are part of a range which is known to have been merged. +// This value should prevent any messages from stale replicas of that range from +// ever resurrecting merged replicas. Whenever merging or subsuming a replica we +// know new replicas can never be created so this value is used even if we +// don't know the current replica ID. +const mergedTombstoneReplicaID roachpb.ReplicaID = math.MaxInt32 + func (r *Replica) preDestroyRaftMuLocked( ctx context.Context, reader engine.Reader, writer engine.Writer, nextReplicaID roachpb.ReplicaID, - rangeIDLocalOnly bool, - mustClearRange bool, + clearRangeIDLocalOnly bool, + mustUseClearRange bool, ) error { desc := r.Desc() - err := clearRangeData(desc, reader, writer, rangeIDLocalOnly, mustClearRange) + err := clearRangeData(desc, reader, writer, clearRangeIDLocalOnly, mustUseClearRange) if err != nil { return err } @@ -89,15 +108,17 @@ func (r *Replica) postDestroyRaftMuLocked(ctx context.Context, ms enginepb.MVCCS // // TODO(benesch): we would ideally atomically suggest the compaction with // the deletion of the data itself. - desc := r.Desc() - r.store.compactor.Suggest(ctx, storagepb.SuggestedCompaction{ - StartKey: roachpb.Key(desc.StartKey), - EndKey: roachpb.Key(desc.EndKey), - Compaction: storagepb.Compaction{ - Bytes: ms.Total(), - SuggestedAtNanos: timeutil.Now().UnixNano(), - }, - }) + if ms != (enginepb.MVCCStats{}) { + desc := r.Desc() + r.store.compactor.Suggest(ctx, storagepb.SuggestedCompaction{ + StartKey: roachpb.Key(desc.StartKey), + EndKey: roachpb.Key(desc.EndKey), + Compaction: storagepb.Compaction{ + Bytes: ms.Total(), + SuggestedAtNanos: timeutil.Now().UnixNano(), + }, + }) + } // NB: we need the nil check below because it's possible that we're GC'ing a // Replica without a replicaID, in which case it does not have a sideloaded @@ -115,21 +136,22 @@ func (r *Replica) postDestroyRaftMuLocked(ctx context.Context, ms enginepb.MVCCS } // destroyRaftMuLocked deletes data associated with a replica, leaving a -// tombstone. +// tombstone. The Replica may not be initialized in which case only the +// range ID local data is removed. 
func (r *Replica) destroyRaftMuLocked(ctx context.Context, nextReplicaID roachpb.ReplicaID) error { startTime := timeutil.Now() ms := r.GetMVCCStats() - batch := r.Engine().NewWriteOnlyBatch() defer batch.Close() + clearRangeIDLocalOnly := !r.IsInitialized() if err := r.preDestroyRaftMuLocked( ctx, r.Engine(), batch, nextReplicaID, - false, /* rangeIDLocalOnly */ - false, /* mustClearRange */ + clearRangeIDLocalOnly, + false, /* mustUseClearRange */ ); err != nil { return err } @@ -148,12 +170,18 @@ func (r *Replica) destroyRaftMuLocked(ctx context.Context, nextReplicaID roachpb if err := r.postDestroyRaftMuLocked(ctx, ms); err != nil { return err } - - log.Infof(ctx, "removed %d (%d+%d) keys in %0.0fms [clear=%0.0fms commit=%0.0fms]", - ms.KeyCount+ms.SysCount, ms.KeyCount, ms.SysCount, - commitTime.Sub(startTime).Seconds()*1000, - preTime.Sub(startTime).Seconds()*1000, - commitTime.Sub(preTime).Seconds()*1000) + if r.IsInitialized() { + log.Infof(ctx, "removed %d (%d+%d) keys in %0.0fms [clear=%0.0fms commit=%0.0fms]", + ms.KeyCount+ms.SysCount, ms.KeyCount, ms.SysCount, + commitTime.Sub(startTime).Seconds()*1000, + preTime.Sub(startTime).Seconds()*1000, + commitTime.Sub(preTime).Seconds()*1000) + } else { + log.Infof(ctx, "removed uninitialized range in %0.0fms [clear=%0.0fms commit=%0.0fms]", + commitTime.Sub(startTime).Seconds()*1000, + preTime.Sub(startTime).Seconds()*1000, + commitTime.Sub(preTime).Seconds()*1000) + } return nil } @@ -188,8 +216,8 @@ func (r *Replica) setTombstoneKey( if nextReplicaID < externalNextReplicaID { nextReplicaID = externalNextReplicaID } - if nextReplicaID > r.mu.minReplicaID { - r.mu.minReplicaID = nextReplicaID + if nextReplicaID > r.mu.tombstoneReplicaID { + r.mu.tombstoneReplicaID = nextReplicaID } r.mu.Unlock() diff --git a/pkg/storage/replica_gc_queue.go b/pkg/storage/replica_gc_queue.go index 649e87d60eaa..81d5da274a77 100644 --- a/pkg/storage/replica_gc_queue.go +++ b/pkg/storage/replica_gc_queue.go @@ -12,7 +12,6 @@ package storage import ( "context" - "math" "time" "github.com/cockroachdb/cockroach/pkg/config" @@ -114,13 +113,13 @@ func newReplicaGCQueue(store *Store, db *client.DB, gossip *gossip.Gossip) *repl // in the past. func (rgcq *replicaGCQueue) shouldQueue( ctx context.Context, now hlc.Timestamp, repl *Replica, _ *config.SystemConfig, -) (bool, float64) { +) (shouldQ bool, prio float64) { + lastCheck, err := repl.GetLastReplicaGCTimestamp(ctx) if err != nil { log.Errorf(ctx, "could not read last replica GC timestamp: %+v", err) return false, 0 } - if _, currentMember := repl.Desc().GetReplicaDescriptor(repl.store.StoreID()); !currentMember { return true, replicaGCPriorityRemoved } @@ -215,10 +214,16 @@ func (rgcq *replicaGCQueue) process( } replyDesc := rs[0] + repl.mu.RLock() + replicaID := repl.mu.replicaID + ticks := repl.mu.ticks + repl.mu.RUnlock() + // Now check whether the replica is meant to still exist. // Maybe it was deleted "under us" by being moved. currentDesc, currentMember := replyDesc.GetReplicaDescriptor(repl.store.StoreID()) - if desc.RangeID == replyDesc.RangeID && currentMember { + sameRange := desc.RangeID == replyDesc.RangeID + if sameRange && currentMember { // This replica is a current member of the raft group. Set the last replica // GC check time to avoid re-processing for another check interval. 
// @@ -230,15 +235,10 @@ func (rgcq *replicaGCQueue) process( if err := repl.setLastReplicaGCTimestamp(ctx, repl.store.Clock().Now()); err != nil { return err } - } else if desc.RangeID == replyDesc.RangeID { + } else if sameRange { // We are no longer a member of this range, but the range still exists. // Clean up our local data. - repl.mu.RLock() - replicaID := repl.mu.replicaID - ticks := repl.mu.ticks - repl.mu.RUnlock() - if replicaID == 0 { // This is a preemptive replica. GC'ing a preemptive replica is a // good idea if and only if the up-replication that it was a part of @@ -284,13 +284,18 @@ func (rgcq *replicaGCQueue) process( rgcq.metrics.RemoveReplicaCount.Inc(1) log.VEventf(ctx, 1, "destroying local data") + + nextReplicaID := replyDesc.NextReplicaID // Note that this seems racy - we didn't hold any locks between reading // the range descriptor above and deciding to remove the replica - but // we pass in the NextReplicaID to detect situations in which the // replica became "non-gc'able" in the meantime by checking (with raftMu // held throughout) whether the replicaID is still smaller than the - // NextReplicaID. - if err := repl.store.RemoveReplica(ctx, repl, replyDesc.NextReplicaID, RemoveOptions{ + // NextReplicaID. Given non-zero replica IDs don't change, this is only + // possible if we currently think we're processing a pre-emptive snapshot + // but discover in RemoveReplica that this range has since been added and + // knows that. + if err := repl.store.RemoveReplica(ctx, repl, nextReplicaID, RemoveOptions{ DestroyData: true, }); err != nil { return err @@ -328,10 +333,10 @@ func (rgcq *replicaGCQueue) process( } } - // A replica ID of MaxInt32 is written when we know a range to have been - // merged. See the Merge case of runPreApplyTriggers() for details. - const nextReplicaID = math.MaxInt32 - if err := repl.store.RemoveReplica(ctx, repl, nextReplicaID, RemoveOptions{ + // A tombstone is written with a value of mergedTombstoneReplicaID because + // we know the range to have been merged. See the Merge case of + // runPreApplyTriggers() for details. + if err := repl.store.RemoveReplica(ctx, repl, mergedTombstoneReplicaID, RemoveOptions{ DestroyData: true, }); err != nil { return err diff --git a/pkg/storage/replica_init.go b/pkg/storage/replica_init.go index b28a3a33591b..4007bbf11bf9 100644 --- a/pkg/storage/replica_init.go +++ b/pkg/storage/replica_init.go @@ -150,26 +150,24 @@ func (r *Replica) initRaftMuLockedReplicaMuLocked( } r.rangeStr.store(replicaID, r.mu.state.Desc) r.connectionClass.set(rpc.ConnectionClassForKey(desc.StartKey)) - if err := r.setReplicaIDRaftMuLockedMuLocked(replicaID); err != nil { - return err + if r.mu.replicaID == 0 { + if err := r.setReplicaIDRaftMuLockedMuLocked(ctx, replicaID); err != nil { + return err + } + } else if r.mu.replicaID != replicaID { + log.Fatalf(ctx, "attempting to initialize a replica which has ID %d with ID %d", + r.mu.replicaID, replicaID) } - r.assertStateLocked(ctx, r.store.Engine()) return nil } -func (r *Replica) setReplicaID(replicaID roachpb.ReplicaID) error { - r.raftMu.Lock() - defer r.raftMu.Unlock() - r.mu.Lock() - defer r.mu.Unlock() - return r.setReplicaIDRaftMuLockedMuLocked(replicaID) -} - -func (r *Replica) setReplicaIDRaftMuLockedMuLocked(replicaID roachpb.ReplicaID) error { - if r.mu.replicaID == replicaID { - // The common case: the replica ID is unchanged. 
- return nil +func (r *Replica) setReplicaIDRaftMuLockedMuLocked( + ctx context.Context, replicaID roachpb.ReplicaID, +) error { + if r.mu.replicaID != 0 { + log.Fatalf(ctx, "cannot set replica ID from anything other than 0, currently %d", + r.mu.replicaID) } if replicaID == 0 { // If the incoming message does not have a new replica ID it is a @@ -177,23 +175,17 @@ func (r *Replica) setReplicaIDRaftMuLockedMuLocked(replicaID roachpb.ReplicaID) // accepted. return nil } - if replicaID < r.mu.minReplicaID { + if replicaID < r.mu.tombstoneReplicaID { return &roachpb.RaftGroupDeletedError{} } if r.mu.replicaID > replicaID { return errors.Errorf("replicaID cannot move backwards from %d to %d", r.mu.replicaID, replicaID) } - - if r.mu.destroyStatus.reason == destroyReasonRemovalPending { - // An earlier incarnation of this replica was removed, but apparently it has been re-added - // now, so reset the status. - r.mu.destroyStatus.Reset() + if r.mu.destroyStatus.RemovingOrRemoved() { + // This replica has been marked for removal and we're trying to resurrect it. + log.Fatalf(ctx, "cannot resurect replica %d", r.mu.replicaID) } - // if r.mu.replicaID != 0 { - // // TODO(bdarnell): clean up previous raftGroup (update peers) - // } - // Initialize or update the sideloaded storage. If the sideloaded storage // already exists (which is iff the previous replicaID was non-zero), then // we have to move the contained files over (this corresponds to the case in @@ -220,24 +212,12 @@ func (r *Replica) setReplicaIDRaftMuLockedMuLocked(replicaID roachpb.ReplicaID) return errors.Wrap(err, "while initializing sideloaded storage") } - previousReplicaID := r.mu.replicaID r.mu.replicaID = replicaID - if replicaID >= r.mu.minReplicaID { - r.mu.minReplicaID = replicaID + 1 - } - // Reset the raft group to force its recreation on next usage. - r.mu.internalRaftGroup = nil - - // If there was a previous replica, repropose its pending commands under - // this new incarnation. - if previousReplicaID != 0 { - if log.V(1) { - log.Infof(r.AnnotateCtx(context.TODO()), "changed replica ID from %d to %d", - previousReplicaID, replicaID) - } - // repropose all pending commands under new replicaID. - r.refreshProposalsLocked(0, reasonReplicaIDChanged) + // Sanity check that we do not already have a raft group as we did not + // know our replica ID before this call. + if r.mu.internalRaftGroup != nil { + log.Fatalf(ctx, "somehow had an initialized raft group on a zero valued replica") } return nil @@ -270,8 +250,12 @@ func (r *Replica) maybeInitializeRaftGroup(ctx context.Context) { // If this replica hasn't initialized the Raft group, create it and // unquiesce and wake the leader to ensure the replica comes up to date. initialized := r.mu.internalRaftGroup != nil + // If this replica has been removed or is in the process of being removed + // then it'll never handle any raft events so there's no reason to initialize + // it now. + removed := !r.mu.destroyStatus.IsAlive() r.mu.RUnlock() - if initialized { + if initialized || removed { return } @@ -281,9 +265,11 @@ func (r *Replica) maybeInitializeRaftGroup(ctx context.Context) { r.mu.Lock() defer r.mu.Unlock() + // If we raced on checking the destroyStatus above that's fine as + // the below withRaftGroupLocked will no-op. 
if err := r.withRaftGroupLocked(true, func(raftGroup *raft.RawNode) (bool, error) { return true, nil - }); err != nil { + }); err != nil && err != errRemoved { log.VErrEventf(ctx, 1, "unable to initialize raft group: %s", err) } } diff --git a/pkg/storage/replica_learner_test.go b/pkg/storage/replica_learner_test.go index 6f2a5ff6825a..9e9072af9e6f 100644 --- a/pkg/storage/replica_learner_test.go +++ b/pkg/storage/replica_learner_test.go @@ -429,13 +429,13 @@ func TestReplicaGCQueueSeesLearnerOrJointConfig(t *testing.T) { require.Contains(t, tracing.FormatRecordedSpans(trace), msg) return tc.LookupRangeOrFatal(t, scratchStartKey) } - desc := checkNoGC() // Make sure it didn't collect the learner. require.NotEmpty(t, desc.Replicas().Learners()) // Now get the range into a joint config. tc.RemoveReplicasOrFatal(t, scratchStartKey, tc.Target(1)) // remove learner + ltk.withStopAfterJointConfig(func() { desc = tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(1)) require.Len(t, desc.Replicas().Filter(predIncoming), 1, desc) @@ -547,10 +547,14 @@ func TestLearnerAdminChangeReplicasRace(t *testing.T) { // Unblock the snapshot, and surprise AddReplicas. It should retry and error // that the descriptor has changed since the AdminChangeReplicas command - // started. + // started. Alternatively it may fail in sending the snapshot because of a + // "raft group deleted" error if the newly added learner attempts to send + // a raft message to another node after it has been removed and then destroys + // itself in response to a ReplicaTooOldError. close(blockSnapshotsCh) - if err := g.Wait(); !testutils.IsError(err, `descriptor changed`) { - t.Fatalf(`expected "descriptor changed" error got: %+v`, err) + const msgRE = `descriptor changed|raft group deleted` + if err := g.Wait(); !testutils.IsError(err, msgRE) { + t.Fatalf(`expected %q error got: %+v`, msgRE, err) } desc = tc.LookupRangeOrFatal(t, scratchStartKey) require.Len(t, desc.Replicas().Voters(), 1) diff --git a/pkg/storage/replica_raft.go b/pkg/storage/replica_raft.go index 52b577da569c..e5744c064f9a 100644 --- a/pkg/storage/replica_raft.go +++ b/pkg/storage/replica_raft.go @@ -379,6 +379,8 @@ func (r *Replica) hasPendingProposalsRLocked() bool { return r.numPendingProposalsRLocked() > 0 } +var errRemoved = errors.New("replica removed") + // stepRaftGroup calls Step on the replica's RawNode with the provided request's // message. Before doing so, it assures that the replica is unquiesced and ready // to handle the request. @@ -445,13 +447,11 @@ func (r *Replica) handleRaftReadyRaftMuLocked( var hasReady bool var rd raft.Ready r.mu.Lock() - lastIndex := r.mu.lastIndex // used for append below lastTerm := r.mu.lastTerm raftLogSize := r.mu.raftLogSize leaderID := r.mu.leaderID lastLeaderID := leaderID - err := r.withRaftGroupLocked(true, func(raftGroup *raft.RawNode) (bool, error) { if err := r.mu.proposalBuf.FlushLockedWithRaftGroup(raftGroup); err != nil { return false, err @@ -462,11 +462,13 @@ func (r *Replica) handleRaftReadyRaftMuLocked( return hasReady /* unquiesceAndWakeLeader */, nil }) r.mu.Unlock() - if err != nil { + if err == errRemoved { + // If we've been removed then just return. + return stats, "", nil + } else if err != nil { const expl = "while checking raft group for Ready" return stats, expl, errors.Wrap(err, expl) } - if !hasReady { // We must update the proposal quota even if we don't have a ready. 
// Consider the case when our quota is of size 1 and two out of three @@ -723,7 +725,6 @@ func (r *Replica) handleRaftReadyRaftMuLocked( // Might have gone negative if node was recently restarted. raftLogSize = 0 } - } // Update protected state - last index, last term, raft log size, and raft @@ -756,10 +757,18 @@ func (r *Replica) handleRaftReadyRaftMuLocked( applicationStart := timeutil.Now() if len(rd.CommittedEntries) > 0 { - if err := appTask.ApplyCommittedEntries(ctx); err != nil { + err := appTask.ApplyCommittedEntries(ctx) + stats.applyCommittedEntriesStats = sm.moveStats() + switch err { + case nil: + case apply.ErrRemoved: + // We know that our replica has been removed. We also know that we've + // stepped the state machine up to the point where it's been removed. + // TODO(ajwerner): decide if there's more to be done here. + return stats, "", err + default: return stats, err.(*nonDeterministicFailure).safeExpl, err } - stats.applyCommittedEntriesStats = sm.moveStats() // etcd raft occasionally adds a nil entry (our own commands are never // empty). This happens in two situations: When a new leader is elected, and @@ -789,11 +798,12 @@ func (r *Replica) handleRaftReadyRaftMuLocked( r.mu.Unlock() } - // TODO(bdarnell): need to check replica id and not Advance if it - // has changed. Or do we need more locking to guarantee that replica - // ID cannot change during handleRaftReady? + // NB: if we just processed a command which removed this replica from the + // raft group we will early return before this point. This, combined with + // the fact that we'll refuse to process messages intended for a higher + // replica ID ensures that our replica ID could not have changed. const expl = "during advance" - if err := r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) { + err = r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) { raftGroup.Advance(rd) // If the Raft group still has more to process then we immediately @@ -804,7 +814,8 @@ func (r *Replica) handleRaftReadyRaftMuLocked( r.store.enqueueRaftUpdateCheck(r.RangeID) } return true, nil - }); err != nil { + }) + if err != nil { return stats, expl, errors.Wrap(err, expl) } @@ -836,9 +847,14 @@ func splitMsgApps(msgs []raftpb.Message) (msgApps, otherMsgs []raftpb.Message) { return msgs[:splitIdx], msgs[splitIdx:] } -func fatalOnRaftReadyErr(ctx context.Context, expl string, err error) { - // Mimic the behavior in processRaft. - log.Fatalf(ctx, "%s: %+v", log.Safe(expl), err) // TODO(bdarnell) +// maybeFatalOnRaftReadyErr will fatal if err is neither nil nor +// apply.ErrRemoved. 
+func maybeFatalOnRaftReadyErr(ctx context.Context, expl string, err error) { + switch err { + case nil, apply.ErrRemoved: + default: + log.FatalfDepth(ctx, 1, "%s: %+v", log.Safe(expl), err) // TODO(bdarnell) + } } // tick the Raft group, returning true if the raft group exists and is @@ -1157,7 +1173,7 @@ func (r *Replica) sendRaftMessage(ctx context.Context, msg raftpb.Message) { r.mu.droppedMessages++ raftGroup.ReportUnreachable(msg.To) return true, nil - }); err != nil { + }); err != nil && err != errRemoved { log.Fatal(ctx, err) } } @@ -1200,7 +1216,7 @@ func (r *Replica) reportSnapshotStatus(ctx context.Context, to roachpb.ReplicaID if err := r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) { raftGroup.ReportSnapshot(uint64(to), snapStatus) return true, nil - }); err != nil { + }); err != nil && err != errRemoved { log.Fatal(ctx, err) } } @@ -1322,13 +1338,15 @@ func (s pendingCmdSlice) Less(i, j int) bool { // varies. // // Requires that Replica.mu is held. +// +// If this Replica is in the process of being removed this method will return +// errRemoved. func (r *Replica) withRaftGroupLocked( mayCampaignOnWake bool, f func(r *raft.RawNode) (unquiesceAndWakeLeader bool, _ error), ) error { - if r.mu.destroyStatus.Removed() { - // Silently ignore all operations on destroyed replicas. We can't return an - // error here as all errors returned from this method are considered fatal. - return nil + if r.mu.destroyStatus.RemovingOrRemoved() { + // Callers know to detect errRemoved as non-fatal. + return errRemoved } if r.mu.replicaID == 0 { @@ -1378,6 +1396,9 @@ func (r *Replica) withRaftGroupLocked( // should not initiate an election while handling incoming raft // messages (which may include MsgVotes from an election in progress, // and this election would be disrupted if we started our own). +// +// If this Replica is in the process of being removed this method will return +// errRemoved. func (r *Replica) withRaftGroup( mayCampaignOnWake bool, f func(r *raft.RawNode) (unquiesceAndWakeLeader bool, _ error), ) error { @@ -1536,6 +1557,9 @@ func (r *Replica) maybeAcquireSnapshotMergeLock( // not be applied yet) and acquires the split or merge lock if // necessary (in addition to other preparation). It returns a function // which will release any lock acquired (or nil). +// +// After this method returns successfully the RHS of the split or merge +// is guaranteed to exist in the Store using GetReplica(). func (r *Replica) maybeAcquireSplitMergeLock( ctx context.Context, raftCmd storagepb.RaftCommand, ) (func(), error) { diff --git a/pkg/storage/replica_raftstorage.go b/pkg/storage/replica_raftstorage.go index 7132eae707e0..a3ab8c4650bb 100644 --- a/pkg/storage/replica_raftstorage.go +++ b/pkg/storage/replica_raftstorage.go @@ -698,7 +698,6 @@ func clearRangeData( } else { keyRanges = rditer.MakeAllKeyRanges(desc) } - var clearRangeFn func(engine.Reader, engine.Writer, engine.MVCCKey, engine.MVCCKey) error if mustClearRange { clearRangeFn = func(eng engine.Reader, writer engine.Writer, start, end engine.MVCCKey) error { @@ -894,8 +893,7 @@ func (r *Replica) applySnapshot( // problematic, as it would prevent this store from ever having a new replica // of the removed range. In this case, however, it's copacetic, as subsumed // ranges _can't_ have new replicas. 
- const subsumedNextReplicaID = math.MaxInt32 - if err := r.clearSubsumedReplicaDiskData(ctx, inSnap.SSSS, s.Desc, subsumedRepls, subsumedNextReplicaID); err != nil { + if err := r.clearSubsumedReplicaDiskData(ctx, inSnap.SSSS, s.Desc, subsumedRepls, mergedTombstoneReplicaID); err != nil { return err } stats.subsumedReplicas = timeutil.Now() @@ -915,7 +913,7 @@ func (r *Replica) applySnapshot( // has not yet been updated. Any errors past this point must therefore be // treated as fatal. - if err := r.clearSubsumedReplicaInMemoryData(ctx, subsumedRepls, subsumedNextReplicaID); err != nil { + if err := r.clearSubsumedReplicaInMemoryData(ctx, subsumedRepls, mergedTombstoneReplicaID); err != nil { log.Fatalf(ctx, "failed to clear in-memory data of subsumed replicas while applying snapshot: %+v", err) } @@ -1013,7 +1011,7 @@ func (r *Replica) clearSubsumedReplicaDiskData( r.store.Engine(), &subsumedReplSST, subsumedNextReplicaID, - true, /* rangeIDLocalOnly */ + true, /* clearRangeIDLocalOnly */ true, /* mustClearRange */ ); err != nil { subsumedReplSST.Close() @@ -1101,7 +1099,7 @@ func (r *Replica) clearSubsumedReplicaInMemoryData( // acquisition leaves the store in a consistent state, and access to the // replicas themselves is protected by their raftMus, which are held from // start to finish. - if err := r.store.removeReplicaImpl(ctx, sr, subsumedNextReplicaID, RemoveOptions{ + if err := r.store.removeInitializedReplicaRaftMuLocked(ctx, sr, subsumedNextReplicaID, RemoveOptions{ DestroyData: false, // data is already destroyed }); err != nil { return err diff --git a/pkg/storage/replica_rangefeed_test.go b/pkg/storage/replica_rangefeed_test.go index e5c0bcb7362e..7634874de2a2 100644 --- a/pkg/storage/replica_rangefeed_test.go +++ b/pkg/storage/replica_rangefeed_test.go @@ -600,16 +600,18 @@ func TestReplicaRangefeedRetryErrors(t *testing.T) { mtc.transport.Listen(partitionStore.Ident.StoreID, &unreliableRaftHandler{ rangeID: rangeID, RaftMessageHandler: partitionStore, - dropReq: func(req *storage.RaftMessageRequest) bool { - // Make sure that even going forward no MsgApp for what we just truncated can - // make it through. The Raft transport is asynchronous so this is necessary - // to make the test pass reliably. - // NB: the Index on the message is the log index that _precedes_ any of the - // entries in the MsgApp, so filter where msg.Index < index, not <= index. - return req.Message.Type == raftpb.MsgApp && req.Message.Index < index + dropFuncs: dropFuncs{ + dropReq: func(req *storage.RaftMessageRequest) bool { + // Make sure that even going forward no MsgApp for what we just truncated can + // make it through. The Raft transport is asynchronous so this is necessary + // to make the test pass reliably. + // NB: the Index on the message is the log index that _precedes_ any of the + // entries in the MsgApp, so filter where msg.Index < index, not <= index. + return req.Message.Type == raftpb.MsgApp && req.Message.Index < index + }, + dropHB: func(*storage.RaftHeartbeat) bool { return false }, + dropResp: func(*storage.RaftMessageResponse) bool { return false }, }, - dropHB: func(*storage.RaftHeartbeat) bool { return false }, - dropResp: func(*storage.RaftMessageResponse) bool { return false }, }) // Check the error. 
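The rangefeed test above now threads its drop predicates through the dropFuncs struct introduced in client_raft_helpers_test.go (not shown in this excerpt). Below is a standalone sketch of that shape with local stand-in types; treating a nil predicate as "drop nothing" is an assumption of the sketch, not necessarily the real helper's behavior:

package main

import "fmt"

// Local stand-ins for the raft transport message types; the real handler wraps
// storage.RaftMessageRequest, RaftHeartbeat and RaftMessageResponse.
type request struct{ index uint64 }
type heartbeat struct{}
type response struct{}

// dropFuncsSketch mirrors the shape of the test helper: one optional predicate
// per message class. In this sketch a nil predicate means nothing is dropped.
type dropFuncsSketch struct {
	dropReq  func(*request) bool
	dropHB   func(*heartbeat) bool
	dropResp func(*response) bool
}

func (d dropFuncsSketch) shouldDropReq(req *request) bool {
	return d.dropReq != nil && d.dropReq(req)
}

func main() {
	// Drop MsgApp-like traffic below a truncation index, keep everything else.
	const truncatedIndex = 10
	d := dropFuncsSketch{
		dropReq: func(req *request) bool { return req.index < truncatedIndex },
	}
	fmt.Println(d.shouldDropReq(&request{index: 5}))  // true
	fmt.Println(d.shouldDropReq(&request{index: 12})) // false
}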
diff --git a/pkg/storage/replica_stats.go b/pkg/storage/replica_stats.go index c1157f0a0f43..61fdc801219d 100644 --- a/pkg/storage/replica_stats.go +++ b/pkg/storage/replica_stats.go @@ -102,6 +102,22 @@ func (rs *replicaStats) splitRequestCounts(other *replicaStats) { } } +// halve is called when a split occurs but the RHS has been removed already +// and cannot be initialized with splitRequestCounts. +func (rs *replicaStats) halve() { + rs.mu.Lock() + defer rs.mu.Unlock() + for i := range rs.mu.requests { + if rs.mu.requests[i] == nil { + continue + } + for k := range rs.mu.requests[i] { + newVal := rs.mu.requests[i][k] / 2.0 + rs.mu.requests[i][k] = newVal + } + } +} + func (rs *replicaStats) record(nodeID roachpb.NodeID) { rs.recordCount(1, nodeID) } diff --git a/pkg/storage/replica_test.go b/pkg/storage/replica_test.go index 05490d143468..a187aa8bc467 100644 --- a/pkg/storage/replica_test.go +++ b/pkg/storage/replica_test.go @@ -6568,7 +6568,7 @@ func TestReplicaDestroy(t *testing.T) { func() { tc.repl.raftMu.Lock() defer tc.repl.raftMu.Unlock() - if err := tc.store.removeReplicaImpl(ctx, tc.repl, origDesc.NextReplicaID, RemoveOptions{ + if err := tc.store.removeInitializedReplicaRaftMuLocked(ctx, tc.repl, origDesc.NextReplicaID, RemoveOptions{ DestroyData: true, }); !testutils.IsError(err, expectedErr) { t.Fatalf("expected error %q but got %v", expectedErr, err) @@ -6579,7 +6579,7 @@ func TestReplicaDestroy(t *testing.T) { func() { tc.repl.raftMu.Lock() defer tc.repl.raftMu.Unlock() - if err := tc.store.removeReplicaImpl(ctx, tc.repl, repl.Desc().NextReplicaID, RemoveOptions{ + if err := tc.store.removeInitializedReplicaRaftMuLocked(ctx, tc.repl, repl.Desc().NextReplicaID, RemoveOptions{ DestroyData: true, }); err != nil { t.Fatal(err) @@ -6671,7 +6671,7 @@ func TestQuotaPoolAccessOnDestroyedReplica(t *testing.T) { func() { tc.repl.raftMu.Lock() defer tc.repl.raftMu.Unlock() - if err := tc.store.removeReplicaImpl(ctx, repl, repl.Desc().NextReplicaID, RemoveOptions{ + if err := tc.store.removeInitializedReplicaRaftMuLocked(ctx, repl, repl.Desc().NextReplicaID, RemoveOptions{ DestroyData: true, }); err != nil { t.Fatal(err) @@ -7278,117 +7278,6 @@ func TestSyncSnapshot(t *testing.T) { } } -// TestReplicaIDChangePending verifies that on a replica ID change, pending -// commands are re-proposed on the new raft group. -func TestReplicaIDChangePending(t *testing.T) { - defer leaktest.AfterTest(t)() - - tc := testContext{} - cfg := TestStoreConfig(nil) - // Disable ticks to avoid automatic reproposals after a timeout, which - // would pass this test. - cfg.RaftTickInterval = math.MaxInt32 - stopper := stop.NewStopper() - defer stopper.Stop(context.TODO()) - tc.StartWithStoreConfig(t, stopper, cfg) - repl := tc.repl - - // Stop the command from being proposed to the raft group and being removed. - proposedOnOld := make(chan struct{}, 1) - repl.mu.Lock() - repl.mu.proposalBuf.testing.submitProposalFilter = func(*ProposalData) (drop bool, _ error) { - select { - case proposedOnOld <- struct{}{}: - default: - } - return true, nil - } - lease := *repl.mu.state.Lease - repl.mu.Unlock() - - // Add a command to the pending list and wait for it to be proposed. 
- magicTS := tc.Clock().Now() - ba := roachpb.BatchRequest{} - ba.Timestamp = magicTS - ba.Add(&roachpb.PutRequest{ - RequestHeader: roachpb.RequestHeader{ - Key: roachpb.Key("a"), - }, - Value: roachpb.MakeValueFromBytes([]byte("val")), - }) - _, _, _, err := repl.evalAndPropose(context.Background(), lease, &ba, &allSpans, endCmds{}) - if err != nil { - t.Fatal(err) - } - <-proposedOnOld - - // Set the raft command handler so we can tell if the command has been - // re-proposed. - proposedOnNew := make(chan struct{}, 1) - repl.mu.Lock() - repl.mu.proposalBuf.testing.submitProposalFilter = func(p *ProposalData) (drop bool, _ error) { - if p.Request.Timestamp == magicTS { - select { - case proposedOnNew <- struct{}{}: - default: - } - } - return false, nil - } - repl.mu.Unlock() - - // Set the ReplicaID on the replica. - if err := repl.setReplicaID(2); err != nil { - t.Fatal(err) - } - - <-proposedOnNew -} - -func TestSetReplicaID(t *testing.T) { - defer leaktest.AfterTest(t)() - - tsc := TestStoreConfig(nil) - tc := testContext{} - stopper := stop.NewStopper() - defer stopper.Stop(context.TODO()) - tc.StartWithStoreConfig(t, stopper, tsc) - - repl := tc.repl - - testCases := []struct { - replicaID roachpb.ReplicaID - minReplicaID roachpb.ReplicaID - newReplicaID roachpb.ReplicaID - expectedMinReplicaID roachpb.ReplicaID - expectedErr string - }{ - {0, 0, 1, 2, ""}, - {0, 1, 1, 2, ""}, - {0, 2, 1, 2, "raft group deleted"}, - {1, 2, 1, 2, ""}, // not an error; replicaID == newReplicaID is checked first - {2, 0, 1, 0, "replicaID cannot move backwards"}, - } - for _, c := range testCases { - t.Run("", func(t *testing.T) { - repl.mu.Lock() - repl.mu.replicaID = c.replicaID - repl.mu.minReplicaID = c.minReplicaID - repl.mu.Unlock() - - err := repl.setReplicaID(c.newReplicaID) - repl.mu.Lock() - if repl.mu.minReplicaID != c.expectedMinReplicaID { - t.Errorf("expected minReplicaID=%d, but found %d", c.expectedMinReplicaID, repl.mu.minReplicaID) - } - repl.mu.Unlock() - if !testutils.IsError(err, c.expectedErr) { - t.Fatalf("expected %q, but found %v", c.expectedErr, err) - } - }) - } -} - func TestReplicaRetryRaftProposal(t *testing.T) { defer leaktest.AfterTest(t)() diff --git a/pkg/storage/store.go b/pkg/storage/store.go index 99fa19230a5a..e63879c23cc1 100644 --- a/pkg/storage/store.go +++ b/pkg/storage/store.go @@ -1142,7 +1142,7 @@ func (s *Store) IsStarted() bool { return atomic.LoadInt32(&s.started) == 1 } -// IterateIDPrefixKeys helps visit system keys that use RangeID prefixing ( such as +// IterateIDPrefixKeys helps visit system keys that use RangeID prefixing (such as // RaftHardStateKey, RaftTombstoneKey, and many others). Such keys could in principle exist at any // RangeID, and this helper efficiently discovers all the keys of the desired type (as specified by // the supplied `keyFn`) and, for each key-value pair discovered, unmarshals it into `msg` and then @@ -2088,10 +2088,72 @@ func (s *Store) AllocateRangeID(ctx context.Context) (roachpb.RangeID, error) { // splitPreApply is called when the raft command is applied. Any // changes to the given ReadWriter will be written atomically with the // split commit. -func splitPreApply(ctx context.Context, eng engine.ReadWriter, split roachpb.SplitTrigger) { +func splitPreApply( + ctx context.Context, eng engine.ReadWriter, split roachpb.SplitTrigger, r *Replica, +) { + // Check on the RHS, we need to ensure that it exists and has a minReplicaID + // less than or equal to the replica we're about to initialize. 
+ // + // The right hand side of the split was already created (and its raftMu + // acquired) in Replica.acquireSplitLock. It must be present here. + rightRepl, err := r.store.GetReplica(split.RightDesc.RangeID) + if err != nil { + log.Fatalf(ctx, "unable to find RHS replica: %+v", err) + } + + // If the RHS is not in the split, sanity check that the LHS is currently + // catching up from a preemptive snapshot. + _, hasRightDesc := split.RightDesc.GetReplicaDescriptor(r.StoreID()) + if !hasRightDesc { + _, lhsExists := r.Desc().GetReplicaDescriptor(r.StoreID()) + if lhsExists { + log.Fatalf(ctx, "cannot process split on s%s which exists in LHS and not in RHS: %+v", + r.StoreID(), split) + } + } + + // Check to see if we know that the RHS has already been removed from this + // store at the replica ID implied by the split. + if rightRepl.isNewerThanSplit(&split) { + // We're in the rare case where we know that the RHS has been removed + // and re-added with a higher replica ID. We know we've never processed a + // snapshot for the right range because up to this point it would overlap + // with the left and ranges cannot move rightwards. + // + // It is important to preserve the HardState because we might however have + // already voted at a higher term. In general this shouldn't happen because + // we add learners and then promote them only after we snapshot but we're + // going to be extra careful in case future versions of cockroach somehow + // promote replicas without ensuring that a snapshot has been received. + // + // Clear the user data the RHS would have inherited from the LHS due to the + // split and additionally clear all of the range ID local state that the + // split trigger writes into the RHS. + // + // Rather than specifically deleting around the data we want to preserve + // we read the HardState to preserve it, clear everything and write back + // the HardState and tombstone. + hs, err := rightRepl.raftMu.stateLoader.LoadHardState(ctx, eng) + if err != nil { + log.Fatalf(ctx, "failed to load hard state for removed rhs: %v", err) + } + const rangeIDLocalOnly = false + const mustUseClearRange = false + if err := clearRangeData(&split.RightDesc, eng, eng, rangeIDLocalOnly, mustUseClearRange); err != nil { + log.Fatalf(ctx, "failed to clear range data for removed rhs: %v", err) + } + if err := rightRepl.raftMu.stateLoader.SetHardState(ctx, eng, hs); err != nil { + log.Fatalf(ctx, "failed to set hard state with 0 commit index for removed rhs: %v", err) + } + if err := r.setTombstoneKey(ctx, eng, r.minReplicaID()); err != nil { + log.Fatalf(ctx, "failed to set tombstone for removed rhs: %v", err) + } + return + } + // Update the raft HardState with the new Commit value now that the // replica is initialized (combining it with existing or default - // Term and Vote). + // Term and Vote). This is the common case. rsl := stateloader.Make(split.RightDesc.RangeID) if err := rsl.SynthesizeRaftState(ctx, eng); err != nil { log.Fatal(ctx, err) @@ -2112,146 +2174,70 @@ func splitPostApply( if err != nil { log.Fatalf(ctx, "unable to find RHS replica: %+v", err) } - { - // Already holding raftMu, see above. - rightRng.mu.Lock() - // The right hand side of the split may have been removed and re-added - // in the meantime, and the replicaID in RightDesc may be stale. - // Consequently the call below may fail with a RaftGroupDeletedError. 
In - // general, this protects earlier incarnations of the replica that were - // since replicaGC'ed from reneging on promises made earlier (for - // example, once the HardState is removed, a replica could cast a - // different vote for the same term). - // - // It is safe to circumvent that in this case because the RHS must have - // remained uninitialized (the LHS blocks all user data, and since our - // Raft logs start at a nonzero index a snapshot must go through before - // any log entries are appended). This means the data in that range is - // just a HardState which we "logically" snapshot by assigning it data - // formerly located within the LHS. - // - // Note that if we ever have a way to replicaGC uninitialized replicas, - // the RHS may have been gc'ed and so the HardState would be gone. In - // that case, the requirement that the HardState remains would have been - // violated if the bypass below were used, which is why we place an - // assertion. - // - // See: - // https://github.com/cockroachdb/cockroach/issues/21146#issuecomment-365757329 - // - // TODO(tbg): this argument is flawed - it's possible for a tombstone - // to exist on the RHS: - // https://github.com/cockroachdb/cockroach/issues/40470 - // Morally speaking, this means that we should throw away the data we - // moved from the LHS to the RHS (depending on the tombstone). - // Realistically speaking it will probably be easier to create the RHS - // anyway, even though there's a tombstone and it may just get gc'ed - // again. Note that for extra flavor, we may not even know whether the - // RHS is currently supposed to exist or not, lending more weight to the - // second approach. - tombstoneKey := keys.RaftTombstoneKey(rightRng.RangeID) - var tombstone roachpb.RaftTombstone - if ok, err := engine.MVCCGetProto( - ctx, r.store.Engine(), tombstoneKey, hlc.Timestamp{}, &tombstone, engine.MVCCGetOptions{}, - ); err != nil { - log.Fatalf(ctx, "unable to load tombstone for RHS: %+v", err) - } else if ok { - log.Fatalf(ctx, "split trigger found right-hand side with tombstone %+v: %v", tombstone, rightRng) - } - rightDesc, ok := split.RightDesc.GetReplicaDescriptor(r.StoreID()) - if !ok { - // This is yet another special quirky case. The local store is not - // necessarily a member of the split; this can occur if this store - // wasn't a member at the time of the split, but is nevertheless - // catching up across the split. For example, add a learner, and - // while it is being caught up via a snapshot, remove the learner - // again, then execute a split, and re-add it. Upon being re-added - // the learner will likely catch up from where the snapshot left it, - // and it will see itself get removed, then we hit this branch when - // the split trigger is applied, and eventually there's a - // ChangeReplicas that re-adds the local store under a new - // replicaID. - // - // So our trigger will have a smaller replicaID for our RHS, which - // will blow up in initRaftMuLockedReplicaMuLocked. We still want - // to force the RHS to accept the descriptor, even though that - // rewinds the replicaID. To do that we want to change the existing - // replicaID, but we didn't find one -- zero is then the only reasonable - // choice. Note that this is also the replicaID a replica that is - // not reflected in its own descriptor will have, i.e. we're cooking - // up less of a special case here than you'd expect at first glance. 
- // - // Note that futzing with the replicaID is a high-risk operation as - // it is what the raft peer will believe itself to be identified by. - // Under no circumstances must we use a replicaID that belongs to - // someone else, or a byzantine situation will result. Zero is - // special-cased and will never init a raft group until the real - // ID is known from inbound raft traffic. - rightDesc.ReplicaID = 0 // for clarity only; it's already zero - } - if rightRng.mu.replicaID > rightDesc.ReplicaID { - rightRng.mu.replicaID = rightDesc.ReplicaID - } - // NB: the safety argument above implies that we don't have to worry - // about restoring the existing minReplicaID if it's nonzero. No - // promises have been made, so none need to be kept. So we clear this - // unconditionally, making sure that it doesn't block us from init'ing - // the RHS. - rightRng.mu.minReplicaID = 0 + // Already holding raftMu, see above. + rightRng.mu.Lock() + + // If we know that the RHS has already been removed at this replica ID + // then we also know that its data has already been removed by the preApply + // so we continue to update the descriptor for the left hand side and + // return. + if rightRng.isNewerThanSplitRLocked(split) { + rightRng.mu.Unlock() + rightRng = nil + } else { + // Finish initialization of the RHS. err := rightRng.initRaftMuLockedReplicaMuLocked(&split.RightDesc, r.store.Clock(), 0) rightRng.mu.Unlock() if err != nil { log.Fatal(ctx, err) } - } - // Finish initialization of the RHS. - - // This initialMaxClosedValue is created here to ensure that follower reads - // do not regress following the split. After the split occurs there will be no - // information in the closedts subsystem about the newly minted RHS range from - // its leaseholder's store. Furthermore, the RHS will have a lease start time - // equal to that of the LHS which might be quite old. This means that - // timestamps which follow the least StartTime for the LHS part are below the - // current closed timestamp for the LHS would no longer be readable on the RHS - // after the split. It is critical that this call to maxClosed happen during - // the splitPostApply so that it refers to a LAI that is equal to the index at - // which this lease was applied. If it were to refer to a LAI after the split - // then the value of initialMaxClosed might be unsafe. - initialMaxClosed := r.maxClosed(ctx) - r.mu.Lock() - rightRng.mu.Lock() - // Copy the minLeaseProposedTS from the LHS. - rightRng.mu.minLeaseProposedTS = r.mu.minLeaseProposedTS - rightRng.mu.initialMaxClosed = initialMaxClosed - rightLease := *rightRng.mu.state.Lease - rightRng.mu.Unlock() - r.mu.Unlock() - - // We need to explicitly wake up the Raft group on the right-hand range or - // else the range could be underreplicated for an indefinite period of time. - // - // Specifically, suppose one of the replicas of the left-hand range never - // applies this split trigger, e.g., because it catches up via a snapshot that - // advances it past this split. That store won't create the right-hand replica - // until it receives a Raft message addressed to the right-hand range. But - // since new replicas start out quiesced, unless we explicitly awaken the - // Raft group, there might not be any Raft traffic for quite a while. 
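Both splitPreApply and splitPostApply now gate on isNewerThanSplit / isNewerThanSplitRLocked, whose definitions are not part of this hunk. A minimal sketch of the comparison they are assumed to perform, inferred from how the callers use them (the tombstoneReplicaID field name is taken from elsewhere in this change; the locking details are assumptions):

// Sketch only; the real implementation lives outside this hunk.
func (r *Replica) isNewerThanSplit(split *roachpb.SplitTrigger) bool {
	r.mu.RLock()
	defer r.mu.RUnlock()
	return r.isNewerThanSplitRLocked(split)
}

func (r *Replica) isNewerThanSplitRLocked(split *roachpb.SplitTrigger) bool {
	rightDesc, hasRightDesc := split.RightDesc.GetReplicaDescriptor(r.StoreID())
	// If this store has already seen the RHS at a replica ID above the one the
	// split would assign (either live or via a tombstone), the RHS was removed
	// and re-added after this split was proposed, so the split must not
	// (re)initialize it.
	return r.mu.tombstoneReplicaID > rightDesc.ReplicaID ||
		(hasRightDesc && r.mu.replicaID > rightDesc.ReplicaID)
}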
- if err := rightRng.withRaftGroup(true, func(r *raft.RawNode) (unquiesceAndWakeLeader bool, _ error) { - return true, nil - }); err != nil { - log.Fatalf(ctx, "unable to create raft group for right-hand range in split: %+v", err) - } + // This initialMaxClosedValue is created here to ensure that follower reads + // do not regress following the split. After the split occurs there will be no + // information in the closedts subsystem about the newly minted RHS range from + // its leaseholder's store. Furthermore, the RHS will have a lease start time + // equal to that of the LHS which might be quite old. This means that + // timestamps which follow the least StartTime for the LHS part are below the + // current closed timestamp for the LHS would no longer be readable on the RHS + // after the split. It is critical that this call to maxClosed happen during + // the splitPostApply so that it refers to a LAI that is equal to the index at + // which this lease was applied. If it were to refer to a LAI after the split + // then the value of initialMaxClosed might be unsafe. + initialMaxClosed := r.maxClosed(ctx) + r.mu.Lock() + rightRng.mu.Lock() + // Copy the minLeaseProposedTS from the LHS. + rightRng.mu.minLeaseProposedTS = r.mu.minLeaseProposedTS + rightRng.mu.initialMaxClosed = initialMaxClosed + rightLease := *rightRng.mu.state.Lease + rightRng.mu.Unlock() + r.mu.Unlock() - // Invoke the leasePostApply method to ensure we properly initialize - // the replica according to whether it holds the lease. This enables - // the txnWaitQueue. - rightRng.leasePostApply(ctx, rightLease, false /* permitJump */) + // We need to explicitly wake up the Raft group on the right-hand range or + // else the range could be underreplicated for an indefinite period of time. + // + // Specifically, suppose one of the replicas of the left-hand range never + // applies this split trigger, e.g., because it catches up via a snapshot that + // advances it past this split. That store won't create the right-hand replica + // until it receives a Raft message addressed to the right-hand range. But + // since new replicas start out quiesced, unless we explicitly awaken the + // Raft group, there might not be any Raft traffic for quite a while. + err = rightRng.withRaftGroup(true, func(r *raft.RawNode) (unquiesceAndWakeLeader bool, _ error) { + return true, nil + }) + if err != nil { + log.Fatalf(ctx, "unable to create raft group for right-hand range in split: %+v", err) + } + // Invoke the leasePostApply method to ensure we properly initialize + // the replica according to whether it holds the lease. This enables + // the txnWaitQueue. + rightRng.leasePostApply(ctx, rightLease, false /* permitJump */) + } // Add the RHS replica to the store. This step atomically updates // the EndKey of the LHS replica and also adds the RHS replica // to the store's replica map. - if err := r.store.SplitRange(ctx, r, rightRng, split.LeftDesc); err != nil { + if err := r.store.SplitRange(ctx, r, rightRng, split); err != nil { // Our in-memory state has diverged from the on-disk state. log.Fatalf(ctx, "%s: failed to update Store after split: %+v", r, err) } @@ -2265,22 +2251,24 @@ func splitPostApply( // might require the range to be split again. Enqueue both the left and right // ranges to speed up such splits. See #10160. 
r.store.splitQueue.MaybeAddAsync(ctx, r, now) - r.store.splitQueue.MaybeAddAsync(ctx, rightRng, now) - // If the range was not properly replicated before the split, the replicate // queue may not have picked it up (due to the need for a split). Enqueue // both the left and right ranges to speed up a potentially necessary // replication. See #7022 and #7800. r.store.replicateQueue.MaybeAddAsync(ctx, r, now) - r.store.replicateQueue.MaybeAddAsync(ctx, rightRng, now) - if len(split.RightDesc.Replicas().All()) == 1 { - // TODO(peter): In single-node clusters, we enqueue the right-hand side of - // the split (the new range) for Raft processing so that the corresponding - // Raft group is created. This shouldn't be necessary for correctness, but - // some tests rely on this (e.g. server.TestNodeStatusWritten). - r.store.enqueueRaftUpdateCheck(rightRng.RangeID) + if rightRng != nil { + r.store.splitQueue.MaybeAddAsync(ctx, rightRng, now) + r.store.replicateQueue.MaybeAddAsync(ctx, rightRng, now) + if len(split.RightDesc.Replicas().All()) == 1 { + // TODO(peter): In single-node clusters, we enqueue the right-hand side of + // the split (the new range) for Raft processing so that the corresponding + // Raft group is created. This shouldn't be necessary for correctness, but + // some tests rely on this (e.g. server.TestNodeStatusWritten). + r.store.enqueueRaftUpdateCheck(rightRng.RangeID) + } } + } // SplitRange shortens the original range to accommodate the new range. The new @@ -2288,13 +2276,14 @@ func splitPostApply( // and newRng.raftMu must be held. // // This is only called from the split trigger in the context of the execution -// of a Raft command. +// of a Raft command. Note that rightRepl will be nil if the replica described +// by rightDesc is known to have been removed. func (s *Store) SplitRange( - ctx context.Context, leftRepl, rightRepl *Replica, newLeftDesc roachpb.RangeDescriptor, + ctx context.Context, leftRepl, rightRepl *Replica, split *roachpb.SplitTrigger, ) error { + rightDesc := &split.RightDesc + newLeftDesc := &split.LeftDesc oldLeftDesc := leftRepl.Desc() - rightDesc := rightRepl.Desc() - if !bytes.Equal(oldLeftDesc.EndKey, rightDesc.EndKey) || bytes.Compare(oldLeftDesc.StartKey, rightDesc.StartKey) >= 0 { return errors.Errorf("left range is not splittable by right range: %+v, %+v", oldLeftDesc, rightDesc) @@ -2302,7 +2291,7 @@ func (s *Store) SplitRange( s.mu.Lock() defer s.mu.Unlock() - if exRng, ok := s.mu.uninitReplicas[rightDesc.RangeID]; ok { + if exRng, ok := s.mu.uninitReplicas[rightDesc.RangeID]; rightRepl != nil && ok { // If we have an uninitialized replica of the new range we require pointer // equivalence with rightRepl. See Store.splitTriggerPostApply(). if exRng != rightRepl { @@ -2315,7 +2304,7 @@ func (s *Store) SplitRange( s.replicaQueues.Delete(int64(rightDesc.RangeID)) } - leftRepl.setDesc(ctx, &newLeftDesc) + leftRepl.setDesc(ctx, newLeftDesc) // Clear the LHS txn wait queue, to redirect to the RHS if // appropriate. We do this after setDescWithoutProcessUpdate @@ -2336,22 +2325,27 @@ func (s *Store) SplitRange( // Clear the original range's request stats, since they include requests for // spans that are now owned by the new range. 
leftRepl.leaseholderStats.resetRequestCounts() - leftRepl.writeStats.splitRequestCounts(rightRepl.writeStats) - if err := s.addReplicaInternalLocked(rightRepl); err != nil { - return errors.Errorf("unable to add replica %v: %s", rightRepl, err) - } + if rightRepl == nil { + throwawayRightWriteStats := new(replicaStats) + leftRepl.writeStats.splitRequestCounts(throwawayRightWriteStats) + } else { + leftRepl.writeStats.splitRequestCounts(rightRepl.writeStats) + if err := s.addReplicaInternalLocked(rightRepl); err != nil { + return errors.Errorf("unable to add replica %v: %s", rightRepl, err) + } - // Update the replica's cached byte thresholds. This is a no-op if the system - // config is not available, in which case we rely on the next gossip update - // to perform the update. - if err := rightRepl.updateRangeInfo(rightRepl.Desc()); err != nil { - return err + // Update the replica's cached byte thresholds. This is a no-op if the system + // config is not available, in which case we rely on the next gossip update + // to perform the update. + if err := rightRepl.updateRangeInfo(rightRepl.Desc()); err != nil { + return err + } + // Add the range to metrics and maybe gossip on capacity change. + s.metrics.ReplicaCount.Inc(1) + s.maybeGossipOnCapacityChange(ctx, rangeAddEvent) } - // Add the range to metrics and maybe gossip on capacity change. - s.metrics.ReplicaCount.Inc(1) - s.maybeGossipOnCapacityChange(ctx, rangeAddEvent) return nil } @@ -2406,7 +2400,7 @@ func (s *Store) MergeRange( // Note that we were called (indirectly) from raft processing so we must // call removeReplicaImpl directly to avoid deadlocking on the right-hand // replica's raftMu. - if err := s.removeReplicaImpl(ctx, rightRepl, rightDesc.NextReplicaID, RemoveOptions{ + if err := s.removeInitializedReplicaRaftMuLocked(ctx, rightRepl, rightDesc.NextReplicaID, RemoveOptions{ DestroyData: false, // the replica was destroyed when the merge commit applied }); err != nil { return errors.Errorf("cannot remove range: %s", err) @@ -2562,25 +2556,48 @@ type RemoveOptions struct { // advanced to or beyond the NextReplicaID since the removal decision was made. // // If opts.DestroyReplica is false, replica.destroyRaftMuLocked is not called. +// +// The passed replica must be initialized. func (s *Store) RemoveReplica( ctx context.Context, rep *Replica, nextReplicaID roachpb.ReplicaID, opts RemoveOptions, ) error { rep.raftMu.Lock() defer rep.raftMu.Unlock() - return s.removeReplicaImpl(ctx, rep, nextReplicaID, opts) + return s.removeInitializedReplicaRaftMuLocked(ctx, rep, nextReplicaID, opts) +} + +// removeReplicaRaftMuLocked removes the passed replica. If the replica is +// initialized the RemoveOptions will be consulted. +func (s *Store) removeReplicaRaftMuLocked( + ctx context.Context, rep *Replica, nextReplicaID roachpb.ReplicaID, opts RemoveOptions, +) (err error) { + rep.raftMu.AssertHeld() + if !rep.IsInitialized() { + err = errors.Wrap(s.removeUninitializedReplicaRaftMuLocked(ctx, rep, nextReplicaID), + "failed to remove uninitialized replica") + } else { + err = errors.Wrap(s.removeInitializedReplicaRaftMuLocked(ctx, rep, nextReplicaID, opts), + "failed to remove replica") + } + return err } // removeReplicaImpl is the implementation of RemoveReplica, which is sometimes // called directly when the necessary lock is already held. It requires that // Replica.raftMu is held and that s.mu is not held. 
-func (s *Store) removeReplicaImpl( +func (s *Store) removeInitializedReplicaRaftMuLocked( ctx context.Context, rep *Replica, nextReplicaID roachpb.ReplicaID, opts RemoveOptions, ) error { rep.raftMu.AssertHeld() // We check both rep.mu.ReplicaID and rep.mu.state.Desc's replica ID because // they can differ in cases when a replica's ID is increased due to an // incoming raft message (see #14231 for background). + // TODO(ajwerner): reconsider some of this sanity checking. rep.mu.Lock() + if rep.mu.destroyStatus.Removed() { + rep.mu.Unlock() + return nil + } replicaID := rep.mu.replicaID if rep.mu.replicaID >= nextReplicaID { rep.mu.Unlock() @@ -2600,12 +2617,7 @@ func (s *Store) removeReplicaImpl( } if !rep.IsInitialized() { - // The split trigger relies on the fact that it can bypass the tombstone - // check for the RHS, but this is only true as long as we never delete - // its HardState. - // - // See the comment in splitPostApply for details. - log.Fatalf(ctx, "can not replicaGC uninitialized replicas") + log.Fatalf(ctx, "can not replicaGC uninitialized replicas in this method") } // During merges, the context might have the subsuming range, so we explicitly @@ -2666,7 +2678,73 @@ func (s *Store) removeReplicaImpl( // TODO(peter): Could release s.mu.Lock() here. s.maybeGossipOnCapacityChange(ctx, rangeRemoveEvent) s.scanner.RemoveReplica(rep) + return nil +} + +func (s *Store) removeUninitializedReplicaRaftMuLocked( + ctx context.Context, rep *Replica, nextReplicaID roachpb.ReplicaID, +) error { + rep.raftMu.AssertHeld() + // Sanity check this removal. + rep.mu.RLock() + ds := rep.mu.destroyStatus + isInitialized := rep.isInitializedRLocked() + rep.mu.RUnlock() + // Somebody already removed this Replica. + if ds.Removed() { + return nil + } + if ds.reason != destroyReasonRemovalPending { + log.Fatalf(ctx, "cannot remove uninitialized replica which is not removal pending: %v", ds) + } + + // When we're in this state we should have already had our destroy status set + // so it shouldn't have been possible to process any raft messages or apply + // any snapshots. + if isInitialized { + log.Fatalf(ctx, "previously uninitialized replica became initialized before removal") + } + + // Proceed with the removal. + rep.readOnlyCmdMu.Lock() + rep.mu.Lock() + rep.cancelPendingCommandsLocked() + rep.mu.internalRaftGroup = nil + rep.mu.destroyStatus.Set(roachpb.NewRangeNotFoundError(rep.RangeID, rep.store.StoreID()), destroyReasonRemoved) + rep.mu.Unlock() + rep.readOnlyCmdMu.Unlock() + + if err := rep.destroyRaftMuLocked(ctx, nextReplicaID); err != nil { + log.Fatalf(ctx, "failed to remove uninitialized replica %v: %v", rep, err) + } + + s.mu.Lock() + defer s.mu.Unlock() + + // Sanity check, could be removed. + value, stillExists := s.mu.replicas.Load(int64(rep.RangeID)) + if !stillExists { + log.Fatalf(ctx, "uninitialized replica was removed in the meantime") + } + existing := (*Replica)(value) + if existing == rep { + log.Infof(ctx, "removing uninitialized replica %v", rep) + } else { + log.Fatalf(ctx, "uninitialized replica %v was unexpectedly replaced", existing) + } + + s.metrics.ReplicaCount.Dec(1) + + // Only an uninitialized replica can have a placeholder since, by + // definition, an initialized replica will be present in the + // replicasByKey map. While the replica will usually consume the + // placeholder itself, that isn't guaranteed and so this invocation + // here is crucial (i.e. don't remove it). 
+ if s.removePlaceholderLocked(ctx, rep.RangeID) { + atomic.AddInt32(&s.counts.droppedPlaceholders, 1) + } + s.unlinkReplicaByRangeIDLocked(rep.RangeID) return nil } @@ -3504,10 +3582,8 @@ func (s *Store) processRaftSnapshotRequest( if err := r.stepRaftGroup(&snapHeader.RaftMessageRequest); err != nil { return roachpb.NewError(err) } - - if _, expl, err := r.handleRaftReadyRaftMuLocked(ctx, inSnap); err != nil { - fatalOnRaftReadyErr(ctx, expl, err) - } + _, expl, err := r.handleRaftReadyRaftMuLocked(ctx, inSnap) + maybeFatalOnRaftReadyErr(ctx, expl, err) removePlaceholder = false return nil }) @@ -3543,33 +3619,41 @@ func (s *Store) HandleRaftResponse(ctx context.Context, resp *RaftMessageRespons repl.raftMu.Lock() defer repl.raftMu.Unlock() repl.mu.Lock() - defer repl.mu.Unlock() // If the replica ID in the error does not match then we know // that the replica has been removed and re-added quickly. In // that case, we don't want to add it to the replicaGCQueue. - if tErr.ReplicaID != repl.mu.replicaID { - log.Infof(ctx, "replica too old response with old replica ID: %s", tErr.ReplicaID) + // If the replica is not alive then we also should ignore this error. + if tErr.ReplicaID != repl.mu.replicaID || + !repl.mu.destroyStatus.IsAlive() || + // Ignore if we want to test the replicaGC queue. + s.TestingKnobs().DisableEagerReplicaRemoval { + repl.mu.Unlock() return nil } - // If the replica ID in the error does match, we know the replica - // will be removed and we can cancel any pending commands. This is - // sometimes necessary to unblock PushTxn operations that are - // necessary for the replica GC to succeed. - repl.cancelPendingCommandsLocked() - // The replica will be garbage collected soon (we are sure // since our replicaID is definitely too old), but in the meantime we // already want to bounce all traffic from it. Note that the replica - // could be re-added with a higher replicaID, in which this error is - // cleared in setReplicaIDRaftMuLockedMuLocked. - if repl.mu.destroyStatus.IsAlive() { - storeID := repl.store.StoreID() - repl.mu.destroyStatus.Set(roachpb.NewRangeNotFoundError(repl.RangeID, storeID), destroyReasonRemovalPending) + // could be re-added with a higher replicaID, but we want to clear the + // replica's data before that happens. + if log.V(1) { + log.Infof(ctx, "setting local replica to destroyed due to ReplicaTooOld error") } - s.replicaGCQueue.AddAsync(ctx, repl, replicaGCPriorityRemoved) + storeID := repl.store.StoreID() + // NB: This response may imply that this range has been added back to + // this store at a higher replica ID. We set RangeNotFound error despite + // the fact that this newer replica may have already been created + // because we don't have a more specific error. In general nobody should + // see it. + repl.mu.destroyStatus.Set(roachpb.NewRangeNotFoundError(repl.RangeID, storeID), + destroyReasonRemovalPending) + repl.mu.Unlock() + nextReplicaID := tErr.ReplicaID + 1 + return s.removeReplicaRaftMuLocked(ctx, repl, nextReplicaID, RemoveOptions{ + DestroyData: true, + }) case *roachpb.RaftGroupDeletedError: if replErr != nil { // RangeNotFoundErrors are expected here; nothing else is. @@ -3632,9 +3716,8 @@ func (s *Store) processRequestQueue(ctx context.Context, rangeID roachpb.RangeID // giving up the lock. Set lastRepl to nil, so we don't handle it // down below as well. 
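The handleRaftReady call sites in this change switch from fatalOnRaftReadyErr to maybeFatalOnRaftReadyErr, whose definition is not shown here. Presumably it treats the "replica removed" sentinel from the apply package as a benign outcome (the replica deleted itself while applying a command) and keeps everything else fatal; a sketch under that assumption:

// Sketch only: tolerate removal-during-apply, die on anything else.
func maybeFatalOnRaftReadyErr(ctx context.Context, expl string, err error) (removed bool) {
	switch {
	case err == nil:
		return false
	case err == apply.ErrRemoved:
		return true
	default:
		log.FatalfDepth(ctx, 1, "%s: %+v", log.Safe(expl), err)
		return true
	}
}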
lastRepl = nil - if _, expl, err := r.handleRaftReadyRaftMuLocked(ctx, noSnap); err != nil { - fatalOnRaftReadyErr(ctx, expl, err) - } + _, expl, err := r.handleRaftReadyRaftMuLocked(ctx, noSnap) + maybeFatalOnRaftReadyErr(ctx, expl, err) } return pErr }) @@ -3677,9 +3760,8 @@ func (s *Store) processRequestQueue(ctx context.Context, rangeID roachpb.RangeID // handleRaftReadyRaftMuLocked) since racing to handle Raft Ready won't // have any undesirable results. ctx = lastRepl.AnnotateCtx(ctx) - if _, expl, err := lastRepl.handleRaftReady(ctx, noSnap); err != nil { - fatalOnRaftReadyErr(ctx, expl, err) - } + _, expl, err := lastRepl.handleRaftReady(ctx, noSnap) + maybeFatalOnRaftReadyErr(ctx, expl, err) } } @@ -3693,9 +3775,7 @@ func (s *Store) processReady(ctx context.Context, rangeID roachpb.RangeID) { ctx = r.AnnotateCtx(ctx) start := timeutil.Now() stats, expl, err := r.handleRaftReady(ctx, noSnap) - if err != nil { - log.Fatalf(ctx, "%s: %+v", log.Safe(expl), err) // TODO(bdarnell) - } + maybeFatalOnRaftReadyErr(ctx, expl, err) elapsed := timeutil.Since(start) s.metrics.RaftWorkingDurationNanos.Inc(elapsed.Nanoseconds()) // Warn if Raft processing took too long. We use the same duration as we @@ -3937,7 +4017,16 @@ func (s *Store) getOrCreateReplica( replicaID roachpb.ReplicaID, creatingReplica *roachpb.ReplicaDescriptor, ) (_ *Replica, created bool, _ error) { + // We need a retry loop as the replica we find in the map may be in the + // process of being removed or may need to be removed. + r := retry.Start(retry.Options{ + InitialBackoff: time.Microsecond, + // Set the backoff up to only a small amount to wait for data that + // might need to be cleared. + MaxBackoff: 10 * time.Millisecond, + }) for { + r.Next() r, created, err := s.tryGetOrCreateReplica( ctx, rangeID, @@ -3966,32 +4055,42 @@ func (s *Store) tryGetOrCreateReplica( replicaID roachpb.ReplicaID, creatingReplica *roachpb.ReplicaDescriptor, ) (_ *Replica, created bool, _ error) { + // NB: All of the below closures assume that both the raftMu and mu are held + // for the passed Replica. + // The common case: look up an existing (initialized) replica. if value, ok := s.mu.replicas.Load(int64(rangeID)); ok { repl := (*Replica)(value) - repl.raftMu.Lock() // not unlocked + repl.raftMu.Lock() // not unlocked on success repl.mu.Lock() defer repl.mu.Unlock() - - var replTooOldErr error - if creatingReplica != nil { - // Drop messages that come from a node that we believe was once a member of - // the group but has been removed. - desc := repl.mu.state.Desc - _, found := desc.GetReplicaDescriptorByID(creatingReplica.ReplicaID) - // It's not a current member of the group. Is it from the past? - if !found && creatingReplica.ReplicaID < desc.NextReplicaID { - replTooOldErr = roachpb.NewReplicaTooOldError(creatingReplica.ReplicaID) - } + if err := tryGetOrCreateHandleFromReplicaTooOld(ctx, s, repl, creatingReplica); err != nil { + repl.raftMu.Unlock() + return nil, false, err + } + if repl.mu.destroyStatus.RemovingOrRemoved() { + repl.raftMu.Unlock() + return nil, false, errRetry + } + if err := tryGetOrCreateHandleToReplicaTooOld(ctx, s, repl, replicaID); err != nil { + repl.raftMu.Unlock() + return nil, false, err } var err error - if replTooOldErr != nil { - err = replTooOldErr - } else if ds := repl.mu.destroyStatus; ds.reason == destroyReasonRemoved { - err = errRetry - } else { - err = repl.setReplicaIDRaftMuLockedMuLocked(replicaID) + if repl.mu.replicaID == 0 { + // This message is telling us about our replica ID. 
+ // This is a common case when dealing with preemptive snapshots. + err = repl.setReplicaIDRaftMuLockedMuLocked(repl.AnnotateCtx(ctx), replicaID) + } else if replicaID != 0 && repl.mu.replicaID > replicaID { + // The sender is behind and is sending to an old replica. + // We could silently drop this message but this way we'll inform the + // sender that they may no longer exist. + err = roachpb.NewRangeNotFoundError(rangeID, s.StoreID()) + } else if replicaID != 0 && repl.mu.replicaID != replicaID { + // This case should have been caught by handleToReplicaTooOld. + log.Fatalf(ctx, "intended replica id %d unexpectedly does not match the current replica %v", + replicaID, repl) } if err != nil { repl.raftMu.Unlock() @@ -4028,7 +4127,7 @@ func (s *Store) tryGetOrCreateReplica( // replica even outside of raft processing. Have to do this after grabbing // Store.mu to maintain lock ordering invariant. repl.mu.Lock() - repl.mu.minReplicaID = tombstone.NextReplicaID + repl.mu.tombstoneReplicaID = tombstone.NextReplicaID // Add the range to range map, but not replicasByKey since the range's start // key is unknown. The range will be added to replicasByKey later when a // snapshot is applied. After unlocking Store.mu above, another goroutine @@ -4073,6 +4172,49 @@ func (s *Store) tryGetOrCreateReplica( return repl, true, nil } +// TODO(ajwerner): The below helper functions are not closures inside of +// tryGetOrCreate because the escape analysis in go1.12 is not advanced +// enough to prevent them from escaping and moving to the heap. + +// Drop messages that come from a node that we believe was once a member of +// the group but has been removed. Assumes that repl.mu and repl.raftMu are both +// held. +func tryGetOrCreateHandleFromReplicaTooOld( + ctx context.Context, s *Store, repl *Replica, creatingReplica *roachpb.ReplicaDescriptor, +) error { + if creatingReplica == nil { + return nil + } + desc := repl.mu.state.Desc + _, found := desc.GetReplicaDescriptorByID(creatingReplica.ReplicaID) + // It's not a current member of the group. Is it from the past? 
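The tombstone loaded just above is what ultimately refuses attempts to recreate this range at a stale replica ID (the now-deleted TestRemovedReplicaTombstone covered this path). The refusal itself happens during replica (re)initialization outside this hunk; conceptually it reduces to a check like the following hypothetical helper (name and placement assumed):

// Sketch only: a removal tombstone records NextReplicaID, and any later
// attempt to use a replica ID below it fails with RaftGroupDeletedError
// ("raft group deleted").
func checkAgainstTombstone(replicaID, tombstoneReplicaID roachpb.ReplicaID) error {
	if replicaID != 0 && replicaID < tombstoneReplicaID {
		return &roachpb.RaftGroupDeletedError{}
	}
	return nil
}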
+ if !found && creatingReplica.ReplicaID < desc.NextReplicaID { + return roachpb.NewReplicaTooOldError(creatingReplica.ReplicaID) + } + return nil +} + +func tryGetOrCreateHandleToReplicaTooOld( + ctx context.Context, s *Store, repl *Replica, replicaID roachpb.ReplicaID, +) error { + if replicaID == 0 || repl.mu.replicaID == 0 || repl.mu.replicaID >= replicaID { + return nil + } + if log.V(1) { + log.Infof(ctx, "found message for replica ID %d which is newer than %v", replicaID, repl) + } + repl.mu.destroyStatus.Set(roachpb.NewRangeNotFoundError(repl.RangeID, repl.StoreID()), + destroyReasonRemovalPending) + repl.mu.Unlock() + defer repl.mu.Lock() + if err := s.removeReplicaRaftMuLocked(ctx, repl, replicaID, RemoveOptions{ + DestroyData: true, + }); err != nil { + log.Fatal(ctx, err) + } + return errRetry +} + func (s *Store) updateCapacityGauges() error { desc, err := s.Descriptor(false /* useCached */) if err != nil { diff --git a/pkg/storage/store_snapshot.go b/pkg/storage/store_snapshot.go index 3fb7ff8b74d5..7c8062fb5250 100644 --- a/pkg/storage/store_snapshot.go +++ b/pkg/storage/store_snapshot.go @@ -628,7 +628,9 @@ func (s *Store) canApplySnapshotLocked( existingRepl.raftMu.AssertHeld() existingRepl.mu.RLock() - existingIsInitialized := existingRepl.isInitializedRLocked() + existingDesc := existingRepl.mu.state.Desc + existingIsInitialized := existingDesc.IsInitialized() + existingDestroyStatus := existingRepl.mu.destroyStatus existingRepl.mu.RUnlock() if existingIsInitialized { @@ -637,15 +639,19 @@ func (s *Store) canApplySnapshotLocked( // in Replica.maybeAcquireSnapshotMergeLock for how this is // made safe. // - // NB: we expect the replica to know its replicaID at this point - // (i.e. !existingIsPreemptive), though perhaps it's possible - // that this isn't true if the leader initiates a Raft snapshot - // (that would provide a range descriptor with this replica in - // it) but this node reboots (temporarily forgetting its - // replicaID) before the snapshot arrives. + // NB: The snapshot must be intended for this replica as + // withReplicaForRequest ensures that requests with a non-zero replica + // id are passed to a replica with a matching id. Given this is not a + // preemptive snapshot we know that its id must be non-zero. return nil, nil } + // If we are not alive then we should not apply a snapshot as our removal + // is imminent. + if existingDestroyStatus.RemovingOrRemoved() { + return nil, existingDestroyStatus.err + } + // We have a key range [desc.StartKey,desc.EndKey) which we want to apply a // snapshot for. Is there a conflicting existing placeholder or an // overlapping range? @@ -670,7 +676,7 @@ func (s *Store) checkSnapshotOverlapLocked( // NB: this check seems redundant since placeholders are also represented in // replicasByKey (and thus returned in getOverlappingKeyRangeLocked). 
if exRng, ok := s.mu.replicaPlaceholders[desc.RangeID]; ok { - return errors.Errorf("%s: canApplySnapshotLocked: cannot add placeholder, have an existing placeholder %s", s, exRng) + return errors.Errorf("%s: canApplySnapshotLocked: cannot add placeholder, have an existing placeholder %s %v", s, exRng, snapHeader.RaftMessageRequest.FromReplica) } // TODO(benesch): consider discovering and GC'ing *all* overlapping ranges, @@ -736,43 +742,16 @@ func (s *Store) shouldAcceptSnapshotData( if snapHeader.IsPreemptive() { return crdberrors.AssertionFailedf(`expected a raft or learner snapshot`) } - - s.mu.Lock() - defer s.mu.Unlock() - - // TODO(tbg): see the comment on desc.Generation for what seems to be a much - // saner way to handle overlap via generational semantics. - desc := *snapHeader.State.Desc - - // First, check for an existing Replica. - if v, ok := s.mu.replicas.Load( - int64(desc.RangeID), - ); ok { - existingRepl := (*Replica)(v) - existingRepl.mu.RLock() - existingIsInitialized := existingRepl.isInitializedRLocked() - existingRepl.mu.RUnlock() - - if existingIsInitialized { - // Regular Raft snapshots can't be refused at this point, - // even if they widen the existing replica. See the comments - // in Replica.maybeAcquireSnapshotMergeLock for how this is - // made safe. - // - // NB: we expect the replica to know its replicaID at this point - // (i.e. !existingIsPreemptive), though perhaps it's possible - // that this isn't true if the leader initiates a Raft snapshot - // (that would provide a range descriptor with this replica in - // it) but this node reboots (temporarily forgetting its - // replicaID) before the snapshot arrives. + pErr := s.withReplicaForRequest(ctx, &snapHeader.RaftMessageRequest, + func(ctx context.Context, r *Replica) *roachpb.Error { + if !r.IsInitialized() { + s.mu.Lock() + defer s.mu.Unlock() + return roachpb.NewError(s.checkSnapshotOverlapLocked(ctx, snapHeader)) + } return nil - } - } - - // We have a key range [desc.StartKey,desc.EndKey) which we want to apply a - // snapshot for. Is there a conflicting existing placeholder or an - // overlapping range? - return s.checkSnapshotOverlapLocked(ctx, snapHeader) + }) + return pErr.GoError() } // receiveSnapshot receives an incoming snapshot via a pre-opened GRPC stream. diff --git a/pkg/storage/store_snapshot_preemptive.go b/pkg/storage/store_snapshot_preemptive.go index 9c8893a93980..67d351719eea 100644 --- a/pkg/storage/store_snapshot_preemptive.go +++ b/pkg/storage/store_snapshot_preemptive.go @@ -330,8 +330,8 @@ func (s *Store) processPreemptiveSnapshotRequest( // Raft has decided the snapshot shouldn't be applied we would be // writing the tombstone key incorrectly. r.mu.Lock() - if r.mu.state.Desc.NextReplicaID > r.mu.minReplicaID { - r.mu.minReplicaID = r.mu.state.Desc.NextReplicaID + if r.mu.state.Desc.NextReplicaID > r.mu.tombstoneReplicaID { + r.mu.tombstoneReplicaID = r.mu.state.Desc.NextReplicaID } r.mu.Unlock() } diff --git a/pkg/storage/store_test.go b/pkg/storage/store_test.go index 8829fd7a15cb..ba431889d1c3 100644 --- a/pkg/storage/store_test.go +++ b/pkg/storage/store_test.go @@ -569,8 +569,8 @@ func TestStoreAddRemoveRanges(t *testing.T) { // Try to remove range 1 again. 
if err := store.RemoveReplica(context.Background(), repl1, repl1.Desc().NextReplicaID, RemoveOptions{ DestroyData: true, - }); err == nil { - t.Fatal("expected error re-removing same range") + }); err != nil { + t.Fatalf("didn't expect error re-removing same range: %v", err) } // Try to add a range with previously-used (but now removed) ID. repl2Dup := createReplica(store, 1, roachpb.RKey("a"), roachpb.RKey("b")) @@ -712,11 +712,10 @@ func TestStoreRemoveReplicaDestroy(t *testing.T) { // Verify that removal of a replica marks it as destroyed so that future raft // commands on the Replica will silently be dropped. - if err := repl1.withRaftGroup(true, func(r *raft.RawNode) (bool, error) { + err = repl1.withRaftGroup(true, func(r *raft.RawNode) (bool, error) { return true, errors.Errorf("unexpectedly created a raft group") - }); err != nil { - t.Fatal(err) - } + }) + require.Equal(t, errRemoved, err) repl1.mu.Lock() expErr := roachpb.NewError(repl1.mu.destroyStatus.err) @@ -1340,21 +1339,24 @@ func splitTestRange(store *Store, key, splitKey roachpb.RKey, t *testing.T) *Rep require.NotNil(t, repl) rangeID, err := store.AllocateRangeID(ctx) require.NoError(t, err) - desc := roachpb.NewRangeDescriptor( + rhsDesc := roachpb.NewRangeDescriptor( rangeID, splitKey, repl.Desc().EndKey, repl.Desc().Replicas()) // Minimal amount of work to keep this deprecated machinery working: Write // some required Raft keys. cv := store.ClusterSettings().Version.Version().Version _, err = stateloader.WriteInitialState( - context.Background(), store.engine, enginepb.MVCCStats{}, *desc, roachpb.Lease{}, + context.Background(), store.engine, enginepb.MVCCStats{}, *rhsDesc, roachpb.Lease{}, hlc.Timestamp{}, cv, stateloader.TruncatedStateUnreplicated, ) require.NoError(t, err) - newRng, err := NewReplica(desc, store, 0) + newRng, err := NewReplica(rhsDesc, store, 0) require.NoError(t, err) newLeftDesc := *repl.Desc() newLeftDesc.EndKey = splitKey - err = store.SplitRange(repl.AnnotateCtx(context.TODO()), repl, newRng, newLeftDesc) + err = store.SplitRange(repl.AnnotateCtx(context.TODO()), repl, newRng, &roachpb.SplitTrigger{ + RightDesc: *rhsDesc, + LeftDesc: newLeftDesc, + }) require.NoError(t, err) return newRng } @@ -2953,104 +2955,6 @@ func TestStoreRemovePlaceholderOnRaftIgnored(t *testing.T) { }) } -// Test that we set proper tombstones for removed replicas and use the -// tombstone to reject attempts to create a replica with a lesser ID. -func TestRemovedReplicaTombstone(t *testing.T) { - defer leaktest.AfterTest(t)() - - const rangeID = 1 - creatingReplica := roachpb.ReplicaDescriptor{ - NodeID: 2, - StoreID: 2, - ReplicaID: 2, - } - - // All test cases assume that the starting replica ID is 1. This assumption - // is enforced by a check within the test logic. 
- testCases := []struct { - setReplicaID roachpb.ReplicaID // set the existing replica to this before removing it - descNextReplicaID roachpb.ReplicaID // the descriptor's NextReplicaID during replica removal - createReplicaID roachpb.ReplicaID // try creating a replica at this ID - expectCreated bool - }{ - {1, 2, 2, true}, - {1, 2, 1, false}, - {1, 2, 1, false}, - {1, 3, 1, false}, - {1, 3, 2, false}, - {1, 3, 3, true}, - {1, 99, 98, false}, - {1, 99, 99, true}, - {2, 2, 2, false}, - {2, 2, 3, true}, - {2, 2, 99, true}, - {98, 2, 98, false}, - {98, 2, 99, true}, - } - for _, c := range testCases { - t.Run("", func(t *testing.T) { - tc := testContext{} - stopper := stop.NewStopper() - ctx := context.TODO() - defer stopper.Stop(ctx) - tc.Start(t, stopper) - s := tc.store - - repl1, err := s.GetReplica(rangeID) - if err != nil { - t.Fatal(err) - } - repl1.mu.Lock() - if repl1.mu.replicaID != 1 { - repl1.mu.Unlock() - t.Fatalf("test precondition not met; expected ReplicaID=1, got %d", repl1.mu.replicaID) - } - repl1.mu.Unlock() - - // Try to trigger a race where the replica ID gets increased during the GC - // process by taking the store lock and inserting a short sleep to cause - // the goroutine to start running the setReplicaID call. - errChan := make(chan error) - - func() { - repl1.raftMu.Lock() - defer repl1.raftMu.Unlock() - s.mu.Lock() - defer s.mu.Unlock() - repl1.mu.Lock() - defer repl1.mu.Unlock() - - go func() { - errChan <- s.RemoveReplica(ctx, repl1, c.descNextReplicaID, RemoveOptions{DestroyData: true}) - }() - - time.Sleep(1 * time.Millisecond) - - if err := repl1.setReplicaIDRaftMuLockedMuLocked(c.setReplicaID); err != nil { - t.Fatal(err) - } - }() - - if err := <-errChan; testutils.IsError(err, "replica ID has changed") { - // We didn't trigger the race, so just return success. - return - } else if err != nil { - t.Fatal(err) - } - - _, created, err := s.getOrCreateReplica(ctx, rangeID, c.createReplicaID, &creatingReplica) - if created != c.expectCreated { - t.Errorf("expected s.getOrCreateReplica(%d, %d, %v).created=%v, got %v", - rangeID, c.createReplicaID, creatingReplica, c.expectCreated, created) - } - if !c.expectCreated && !testutils.IsError(err, "raft group deleted") { - t.Errorf("expected s.getOrCreateReplica(%d, %d, %v).err='raft group deleted', got %v", - rangeID, c.createReplicaID, creatingReplica, err) - } - }) - } -} - type fakeSnapshotStream struct { nextResp *SnapshotResponse nextErr error diff --git a/pkg/storage/testing_knobs.go b/pkg/storage/testing_knobs.go index 7ee8dccf6953..14d1f31a24db 100644 --- a/pkg/storage/testing_knobs.go +++ b/pkg/storage/testing_knobs.go @@ -139,6 +139,13 @@ type StoreTestingKnobs struct { // DisableRefreshReasonTicks disables refreshing pending commands // periodically. DisableRefreshReasonTicks bool + // DisableEagerReplicaRemoval prevents the Replica from destroying itself + // when it encounters a ChangeReplicasTrigger which would remove it or when + // a ReplicaTooOldError in a RaftMessageResponse would lead to removal. + // This option can lead to nasty cases during shutdown where a replica will + // spin attempting to acquire a split or merge lock on a RHS which will + // always fail and is generally not safe but is useful for testing. + DisableEagerReplicaRemoval bool // RefreshReasonTicksPeriod overrides the default period over which // pending commands are refreshed. The period is specified as a multiple // of Raft group ticks.
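Tests that want the pre-change behavior of leaving stale replicas to the replica GC queue set the new knob on the store config before starting the cluster. A minimal usage sketch following the existing client-test pattern (surrounding scaffolding assumed):

// Sketch of opting a test out of eager replica removal.
storeCfg := storage.TestStoreConfig(nil)
storeCfg.TestingKnobs.DisableEagerReplicaRemoval = true
mtc := &multiTestContext{storeConfig: &storeCfg}
mtc.Start(t, 3)
defer mtc.Stop()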