storage: ensure Replica objects never change replicaID

We've seen instability recently due to invariants being violated as replicas catch up across periods of being removed and re-added to a range. Due to learner replicas and their rollback behavior this is now a relatively common case. Rather than handle all of these various scenarios this PR prevents them from occuring by actively removing replicas when we determine that they must have been removed. Here's a high level overview of the change: * Once a Replica object has a non-zero Replica.mu.replicaID it will not change. * In this commit however, if a node crashes it may forget that it learned about a replica ID. * If a raft message or snapshot addressed to a higher replica ID is received the current replica will be removed completely. * If a replica sees a ChangeReplicasTrigger which removes it then it completely removes itself while applying that command. * Replica.mu.destroyStatus is used to meaningfully signify the removal state of a Replica. Replicas about to be synchronously removed are in destroyReasonRemovalPending. This hopefully gives us some new invariants: * There is only ever at most 1 Replica which IsAlive() for a range on a Store at a time. * Once a Replica has a non-zero ReplicaID is never changes. * This applies only to the in-memory object, not the store itself. * Once a Replica applies a command as a part of the range descriptor it will never apply another command as a different Replica ID or outside of the Range. * Corrolary: a Replica created as a learner will only ever apply commands while that replica is in the range. The change also introduces some new complexity. Namely we now allow removal of uninitialized replicas, including their hard state. This allows us to catch up across a split even when we know the RHS must have been removed. Fixes cockroachdb#40367. Issue cockroachdb#38772 (comment) manifests itself as the RHS not being found for a merge. This happens because the Replica is processing commands to catch itself up while it is not in the range. This is no longer possible. Fixes cockroachdb#40257. Issue cockroachdb#40257 is another case of a replica processing commands while it is not in the range. Fixes cockroachdb#40470. Issue cockroachdb#40470 is caused by a RHS learning about its existence and removal prior to a LHS processing a split. This case is now handled properly and is tested. Release justification: This commit is safe for 19.2 because it fixes release blockers. Release note (bug fix): Avoid internal re-use of Replica objects to fix the following crashes: cockroachdb#38772 "found rXXX:{-} [, next=0, gen=0?] in place of the RHS" cockroachdb#39796 "replica descriptor of local store not found in right hand side of split" cockroachdb#40470 "split trigger found right-hand side with tombstone" cockroachdb#40257 "snapshot widens existing replica, but no replica exists for subsumed key"
ajwerner · Sep 23, 2019 · d139c0d · d139c0d
1 parent 4853ae2
commit d139c0d
Show file tree

Hide file tree

Showing 25 changed files with 1,930 additions and 779 deletions.
diff --git a/pkg/storage/apply/task.go b/pkg/storage/apply/task.go
@@ -12,6 +12,7 @@ package apply
 
 import (
 	"context"
+	"errors"
 
 	"go.etcd.io/etcd/raft/raftpb"
 )
@@ -54,6 +55,10 @@ type StateMachine interface {
 	ApplySideEffects(CheckedCommand) (AppliedCommand, error)
 }
 
+// ErrRemoved can be returned from ApplySideEffects which will stop the
+// task from processing more commands and return immediately.
+var ErrRemoved = errors.New("replica removed")
+
 // Batch accumulates a series of updates from Commands and performs them
 // all at once to its StateMachine when applied. Groups of Commands will be
 // staged in the Batch such that one or more trivial Commands are staged or

diff --git a/pkg/storage/client_merge_test.go b/pkg/storage/client_merge_test.go
@@ -57,7 +57,6 @@ import (
 	"github.com/pkg/errors"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
-	"go.etcd.io/etcd/raft"
 	"go.etcd.io/etcd/raft/raftpb"
 )
 
@@ -1641,6 +1640,7 @@ func TestStoreReplicaGCAfterMerge(t *testing.T) {
 	storeCfg.TestingKnobs.DisableReplicateQueue = true
 	storeCfg.TestingKnobs.DisableReplicaGCQueue = true
 	storeCfg.TestingKnobs.DisableMergeQueue = true
+	storeCfg.TestingKnobs.DisableEagerReplicaRemoval = true
 	mtc := &multiTestContext{storeConfig: &storeCfg}
 	mtc.Start(t, 2)
 	defer mtc.Stop()
@@ -2043,8 +2043,20 @@ func TestStoreRangeMergeSlowAbandonedFollower(t *testing.T) {
 	lhsRepl2.RaftUnlock()
 
 	// Ensure that the unblocked merge eventually applies and subsumes the RHS.
+	// In general this will happen due to receiving a ReplicaTooOldError but
+	// it may require the replica GC queue. In rare cases the LHS will never
+	// hear about the merge and may need to be GC'd on its own.
 	testutils.SucceedsSoon(t, func() error {
-		if _, err := store2.GetReplica(rhsDesc.RangeID); err == nil {
+		// Make the the LHS gets destroyed.
+		if lhsRepl, err := store2.GetReplica(lhsDesc.RangeID); err == nil {
+			if err := store2.ManualReplicaGC(lhsRepl); err != nil {
+				t.Fatal(err)
+			}
+		}
+		if rhsRepl, err := store2.GetReplica(rhsDesc.RangeID); err == nil {
+			if err := store2.ManualReplicaGC(rhsRepl); err != nil {
+				t.Fatal(err)
+			}
 			return errors.New("rhs not yet destroyed")
 		}
 		return nil
@@ -2060,6 +2072,7 @@ func TestStoreRangeMergeAbandonedFollowers(t *testing.T) {
 	storeCfg.TestingKnobs.DisableReplicaGCQueue = true
 	storeCfg.TestingKnobs.DisableSplitQueue = true
 	storeCfg.TestingKnobs.DisableMergeQueue = true
+	storeCfg.TestingKnobs.DisableEagerReplicaRemoval = true
 	mtc := &multiTestContext{storeConfig: &storeCfg}
 	mtc.Start(t, 3)
 	defer mtc.Stop()
@@ -2827,74 +2840,6 @@ func TestStoreRangeMergeSlowWatcher(t *testing.T) {
 	}
 }
 
-// unreliableRaftHandler drops all Raft messages that are addressed to the
-// specified rangeID, but lets all other messages through.
-type unreliableRaftHandler struct {
-	rangeID roachpb.RangeID
-	storage.RaftMessageHandler
-	// If non-nil, can return false to avoid dropping a msg to rangeID
-	dropReq  func(*storage.RaftMessageRequest) bool
-	dropHB   func(*storage.RaftHeartbeat) bool
-	dropResp func(*storage.RaftMessageResponse) bool
-}
-
-func (h *unreliableRaftHandler) HandleRaftRequest(
-	ctx context.Context,
-	req *storage.RaftMessageRequest,
-	respStream storage.RaftMessageResponseStream,
-) *roachpb.Error {
-	if len(req.Heartbeats)+len(req.HeartbeatResps) > 0 {
-		reqCpy := *req
-		req = &reqCpy
-		req.Heartbeats = h.filterHeartbeats(req.Heartbeats)
-		req.HeartbeatResps = h.filterHeartbeats(req.HeartbeatResps)
-		if len(req.Heartbeats)+len(req.HeartbeatResps) == 0 {
-			// Entirely filtered.
-			return nil
-		}
-	} else if req.RangeID == h.rangeID {
-		if h.dropReq == nil || h.dropReq(req) {
-			log.Infof(
-				ctx,
-				"dropping Raft message %s",
-				raft.DescribeMessage(req.Message, func([]byte) string {
-					return "<omitted>"
-				}),
-			)
-
-			return nil
-		}
-	}
-	return h.RaftMessageHandler.HandleRaftRequest(ctx, req, respStream)
-}
-
-func (h *unreliableRaftHandler) filterHeartbeats(
-	hbs []storage.RaftHeartbeat,
-) []storage.RaftHeartbeat {
-	if len(hbs) == 0 {
-		return hbs
-	}
-	var cpy []storage.RaftHeartbeat
-	for i := range hbs {
-		hb := &hbs[i]
-		if hb.RangeID != h.rangeID || (h.dropHB != nil && !h.dropHB(hb)) {
-			cpy = append(cpy, *hb)
-		}
-	}
-	return cpy
-}
-
-func (h *unreliableRaftHandler) HandleRaftResponse(
-	ctx context.Context, resp *storage.RaftMessageResponse,
-) error {
-	if resp.RangeID == h.rangeID {
-		if h.dropResp == nil || h.dropResp(resp) {
-			return nil
-		}
-	}
-	return h.RaftMessageHandler.HandleRaftResponse(ctx, resp)
-}
-
 func TestStoreRangeMergeRaftSnapshot(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 
@@ -3082,20 +3027,22 @@ func TestStoreRangeMergeRaftSnapshot(t *testing.T) {
 	mtc.transport.Listen(store2.Ident.StoreID, &unreliableRaftHandler{
 		rangeID:            aRepl0.RangeID,
 		RaftMessageHandler: store2,
-		dropReq: func(req *storage.RaftMessageRequest) bool {
-			// Make sure that even going forward no MsgApp for what we just
-			// truncated can make it through. The Raft transport is asynchronous
-			// so this is necessary to make the test pass reliably - otherwise
-			// the follower on store2 may catch up without needing a snapshot,
-			// tripping up the test.
-			//
-			// NB: the Index on the message is the log index that _precedes_ any of the
-			// entries in the MsgApp, so filter where msg.Index < index, not <= index.
-			return req.Message.Type == raftpb.MsgApp && req.Message.Index < index
+		dropFuncs: dropFuncs{
+			dropReq: func(req *storage.RaftMessageRequest) bool {
+				// Make sure that even going forward no MsgApp for what we just
+				// truncated can make it through. The Raft transport is asynchronous
+				// so this is necessary to make the test pass reliably - otherwise
+				// the follower on store2 may catch up without needing a snapshot,
+				// tripping up the test.
+				//
+				// NB: the Index on the message is the log index that _precedes_ any of the
+				// entries in the MsgApp, so filter where msg.Index < index, not <= index.
+				return req.Message.Type == raftpb.MsgApp && req.Message.Index < index
+			},
+			// Don't drop heartbeats or responses.
+			dropHB:   func(*storage.RaftHeartbeat) bool { return false },
+			dropResp: func(*storage.RaftMessageResponse) bool { return false },
 		},
-		// Don't drop heartbeats or responses.
-		dropHB:   func(*storage.RaftHeartbeat) bool { return false },
-		dropResp: func(*storage.RaftMessageResponse) bool { return false },
 	})
 
 	// Wait for all replicas to catch up to the same point. Because we truncated
@@ -3372,9 +3319,12 @@ func TestMergeQueue(t *testing.T) {
 	t.Run("non-collocated", func(t *testing.T) {
 		reset(t)
 		verifyUnmerged(t)
-		mtc.replicateRange(rhs().RangeID, 1)
-		mtc.transferLease(ctx, rhs().RangeID, 0, 1)
-		mtc.unreplicateRange(rhs().RangeID, 0)
+		rhsRangeID := rhs().RangeID
+		mtc.replicateRange(rhsRangeID, 1)
+		mtc.transferLease(ctx, rhsRangeID, 0, 1)
+		mtc.unreplicateRange(rhsRangeID, 0)
+		require.NoError(t, mtc.waitForUnreplicated(rhsRangeID, 0))
+
 		clearRange(t, lhsStartKey, rhsEndKey)
 		store.MustForceMergeScanAndProcess()
 		verifyMerged(t)

diff --git a/pkg/storage/client_metrics_test.go b/pkg/storage/client_metrics_test.go
@@ -26,6 +26,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
 	"github.com/cockroachdb/cockroach/pkg/util/metric"
 	"github.com/pkg/errors"
+	"github.com/stretchr/testify/require"
 )
 
 func checkGauge(t *testing.T, id string, g *metric.Gauge, e int64) {
@@ -313,8 +314,9 @@ func TestStoreMetrics(t *testing.T) {
 		return mtc.unreplicateRangeNonFatal(replica.RangeID, 0)
 	})
 
-	// Force GC Scan on store 0 in order to fully remove range.
-	mtc.stores[1].MustForceReplicaGCScanAndProcess()
+	// Wait until we're sure that store 0 has successfully processed its removal.
+	require.NoError(t, mtc.waitForUnreplicated(replica.RangeID, 0))
+
 	mtc.waitForValues(roachpb.Key("z"), []int64{0, 5, 5})
 
 	// Verify range count is as expected.