kv: set destroy status before destroying data on subsumed replicas
This PR cleans up some handling around replica destruction that scared me
when working on #46329 and #55293. Specifically, there were cases during
merges where the destroy status on a replica would not be set until after
that replica's data was destroyed. This was true of merges applied through
entry application, although in those cases no user data was removed, so it
seems mostly harmless. More concerning, it was also true of merges applied
through snapshot application, which is worse because snapshot application
can delete user data if a subsumed range is only partially covered (see
`clearSubsumedReplicaDiskData`). So we were hypothetically risking
user-visible correctness problems, especially now that we allow follower
reads through on subsumed ranges as of #51594.
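
To make the hazard concrete, here is a minimal, self-contained Go sketch of
the invariant this PR enforces: a replica must be marked as destroyed before
its data is removed, so that no request can be admitted afterwards and
observe the missing data. The types and names below (`toyReplica`,
`destroyStatus`, `read`) are illustrative stand-ins, not the actual kvserver
code.

package main

import (
	"fmt"
	"sync"
)

// destroyStatus is a stand-in for the real kvserver destroy status.
type destroyStatus struct {
	removed bool
}

func (s *destroyStatus) Set()          { s.removed = true }
func (s *destroyStatus) Removed() bool { return s.removed }

// toyReplica is an illustrative stand-in for *Replica.
type toyReplica struct {
	mu struct {
		sync.Mutex
		destroyStatus destroyStatus
	}
	data map[string]string
}

// destroy follows the safe ordering: mark the replica as destroyed first,
// so that no new requests are accepted, and only then remove its data.
func (r *toyReplica) destroy() {
	r.mu.Lock()
	r.mu.destroyStatus.Set()
	r.mu.Unlock()

	// Only now is it safe to delete the replicated data. Reversing these two
	// steps opens a window in which a read could be admitted and miss data.
	r.data = nil
}

// read refuses to serve once the replica is marked as destroyed, so it can
// never observe the partially deleted state.
func (r *toyReplica) read(key string) (string, bool) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.mu.destroyStatus.Removed() {
		return "", false
	}
	v, ok := r.data[key]
	return v, ok
}

func main() {
	r := &toyReplica{data: map[string]string{"a": "1"}}
	r.destroy()
	if _, ok := r.read("a"); !ok {
		fmt.Println("read correctly rejected after destroy")
	}
}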

This PR patches up these issues and then adds a few extra assertions that
enforce stricter preconditions for the functions at play during replica
destruction. Specifically, we now assert that the destroy status is set to
`destroyReasonRemoved` _before_ calling `preDestroyRaftMuLocked`. We also
assert that if `RemoveOptions.DestroyData` is false when passed to
`removeInitializedReplicaRaftMuLocked`, then the destroy status is also set.

This unification allows us to remove the `ignoreDestroyStatus` flag on
`RemoveOptions`, whose meaning had become exactly the inverse of
`DestroyData`. In hindsight, pushing on why this extra flag was needed and
what new information it was conveying to the function could have caught
these issues a little earlier.
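
As a rough sketch of why the flag carried no independent information
(simplified, with assumed helper names; this is not the real
`removeInitializedReplicaRaftMuLocked` signature): the check can be expressed
purely in terms of `DestroyData`. When the caller asks the store to destroy
the data, an already-set destroy status means a racing removal won and the
call is a no-op; when the caller has already destroyed the data itself, the
destroy status must already be set.

package main

import (
	"errors"
	"fmt"
)

// RemoveOptions mirrors the shape of the options struct after this PR. The
// helper below is an illustrative sketch, not the real store method.
type RemoveOptions struct {
	DestroyData bool
}

// checkDestroyStatus reports whether the removal should be skipped (a racing
// removal already happened) or must fail loudly (the caller violated the new
// precondition by destroying data without setting the destroy status).
func checkDestroyStatus(opts RemoveOptions, alreadyRemoved bool) (skip bool, err error) {
	if opts.DestroyData {
		// The store is being asked to destroy the data. If the destroy status
		// is already set, another removal (e.g. via the GC queue) raced with
		// us and this call is a no-op.
		return alreadyRemoved, nil
	}
	// The caller destroyed the data itself, so it must also have set the
	// destroy status beforehand; anything else is a programming error.
	if !alreadyRemoved {
		return false, errors.New("replica not marked as destroyed but data already destroyed")
	}
	return false, nil
}

func main() {
	// A caller that used to pass ignoreDestroyStatus=true now simply passes
	// DestroyData=false and sets the destroy status itself beforehand.
	skip, err := checkDestroyStatus(RemoveOptions{DestroyData: false}, true)
	fmt.Println(skip, err) // false <nil>: proceed with in-memory removal only

	skip, err = checkDestroyStatus(RemoveOptions{DestroyData: true}, true)
	fmt.Println(skip, err) // true <nil>: already removed, no-op
}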

I think we'll want to backport this to v20.2, though probably in the first
patch release, because there is some risk here (especially without
sufficient time to bake on master) and we aren't aware of any correctness
issues arising from the combination of the bug fixed here and #51594.

Release note (bug fix): A hypothesized bug that could allow a follower
read to miss data on a range in the middle of being merged away into its
left-hand neighbor was fixed. The bug seems exceedingly unlikely to have
materialized in practice.
nvanbenschoten committed Oct 14, 2020
1 parent ed3010d commit 8142440
Showing 6 changed files with 53 additions and 23 deletions.
4 changes: 0 additions & 4 deletions pkg/kv/kvserver/replica_application_result.go
@@ -338,10 +338,6 @@ func (r *Replica) handleChangeReplicasResult(
 	if err := r.store.removeInitializedReplicaRaftMuLocked(ctx, r, chng.NextReplicaID(), RemoveOptions{
 		// We destroyed the data when the batch committed so don't destroy it again.
 		DestroyData: false,
-		// In order to detect the GC queue racing with other causes of replica removal
-		// the store will no-op when removing a replica which is already marked as removed
-		// unless we set ignoreDestroyStatus to true.
-		ignoreDestroyStatus: true,
 	}); err != nil {
 		log.Fatalf(ctx, "failed to remove replica: %v", err)
 	}
17 changes: 13 additions & 4 deletions pkg/kv/kvserver/replica_application_state_machine.go
@@ -652,6 +652,15 @@ func (b *replicaAppBatch) runPreApplyTriggersAfterStagingWriteBatch(
 		// its unlock method in cmd.splitMergeUnlock.
 		rhsRepl.raftMu.AssertHeld()

+		// We mark the replica as destroyed so that new commands are not
+		// accepted. This destroy status will be detected after the batch
+		// commits by handleMergeResult() to finish the removal.
+		rhsRepl.mu.Lock()
+		rhsRepl.mu.destroyStatus.Set(
+			roachpb.NewRangeNotFoundError(rhsRepl.RangeID, rhsRepl.store.StoreID()),
+			destroyReasonRemoved)
+		rhsRepl.mu.Unlock()
+
 		// Use math.MaxInt32 (mergedTombstoneReplicaID) as the nextReplicaID as an
 		// extra safeguard against creating new replicas of the RHS. This isn't
 		// required for correctness, since the merge protocol should guarantee that
@@ -726,11 +735,11 @@ func (b *replicaAppBatch) runPreApplyTriggersAfterStagingWriteBatch(
 		!b.r.store.TestingKnobs().DisableEagerReplicaRemoval {

 		// We mark the replica as destroyed so that new commands are not
-		// accepted. This destroy status will be detected after the batch commits
-		// by Replica.handleChangeReplicasTrigger() to finish the removal.
+		// accepted. This destroy status will be detected after the batch
+		// commits by handleChangeReplicasResult() to finish the removal.
 		//
-		// NB: we must be holding the raftMu here because we're in the
-		// midst of application.
+		// NB: we must be holding the raftMu here because we're in the midst of
+		// application.
 		b.r.mu.Lock()
 		b.r.mu.destroyStatus.Set(
 			roachpb.NewRangeNotFoundError(b.r.RangeID, b.r.store.StoreID()),
12 changes: 11 additions & 1 deletion pkg/kv/kvserver/replica_destroy.go
@@ -80,7 +80,17 @@ func (r *Replica) preDestroyRaftMuLocked(
 	clearRangeIDLocalOnly bool,
 	mustUseClearRange bool,
 ) error {
-	desc := r.Desc()
+	r.mu.RLock()
+	desc := r.descRLocked()
+	removed := r.mu.destroyStatus.Removed()
+	r.mu.RUnlock()
+
+	// The replica must be marked as destroyed before its data is removed. If
+	// not, we risk new commands being accepted and observing the missing data.
+	if !removed {
+		log.Fatalf(ctx, "replica not marked as destroyed before call to preDestroyRaftMuLocked: %v", r)
+	}
+
 	err := clearRangeData(desc, reader, writer, clearRangeIDLocalOnly, mustUseClearRange)
 	if err != nil {
 		return err
12 changes: 11 additions & 1 deletion pkg/kv/kvserver/replica_raftstorage.go
@@ -1032,6 +1032,15 @@ func (r *Replica) clearSubsumedReplicaDiskData(
 	keyRanges := getKeyRanges(desc)
 	totalKeyRanges := append([]rditer.KeyRange(nil), keyRanges[:]...)
 	for _, sr := range subsumedRepls {
+		// We mark the replica as destroyed so that new commands are not
+		// accepted. This destroy status will be detected after the batch
+		// commits by clearSubsumedReplicaInMemoryData() to finish the removal.
+		sr.mu.Lock()
+		sr.mu.destroyStatus.Set(
+			roachpb.NewRangeNotFoundError(sr.RangeID, sr.store.StoreID()),
+			destroyReasonRemoved)
+		sr.mu.Unlock()
+
 		// We have to create an SST for the subsumed replica's range-id local keys.
 		subsumedReplSSTFile := &storage.MemFile{}
 		subsumedReplSST := storage.MakeIngestionSSTWriter(subsumedReplSSTFile)
@@ -1146,7 +1155,8 @@ func (r *Replica) clearSubsumedReplicaInMemoryData(
 		// replicas themselves is protected by their raftMus, which are held from
 		// start to finish.
 		if err := r.store.removeInitializedReplicaRaftMuLocked(ctx, sr, subsumedNextReplicaID, RemoveOptions{
-			DestroyData: false, // data is already destroyed
+			// The data was already destroyed by clearSubsumedReplicaDiskData.
+			DestroyData: false,
 		}); err != nil {
 			return err
 		}
4 changes: 3 additions & 1 deletion pkg/kv/kvserver/store_merge.go
@@ -51,7 +51,9 @@ func (s *Store) MergeRange(
 	// call removeInitializedReplicaRaftMuLocked directly to avoid deadlocking
 	// on the right-hand replica's raftMu.
 	if err := s.removeInitializedReplicaRaftMuLocked(ctx, rightRepl, rightDesc.NextReplicaID, RemoveOptions{
-		DestroyData: false, // the replica was destroyed when the merge commit applied
+		// The replica was destroyed by the tombstones added to the batch in
+		// runPreApplyTriggersAfterStagingWriteBatch.
+		DestroyData: false,
 	}); err != nil {
 		return errors.Errorf("cannot remove range: %s", err)
 	}
27 changes: 15 additions & 12 deletions pkg/kv/kvserver/store_remove_replica.go
@@ -21,13 +21,8 @@ import (

 // RemoveOptions bundles boolean parameters for Store.RemoveReplica.
 type RemoveOptions struct {
+	// If true, the replica's destroyStatus must be marked as removed.
 	DestroyData bool
-
-	// ignoreDestroyStatus allows a caller to instruct the store to remove
-	// replicas which are already marked as destroyed. This is helpful in cases
-	// where the caller knows that it set the destroy status and cannot have raced
-	// with another goroutine. See Replica.handleChangeReplicasResult().
-	ignoreDestroyStatus bool
 }

 // RemoveReplica removes the replica from the store's replica map and from the
@@ -78,10 +73,19 @@ func (s *Store) removeInitializedReplicaRaftMuLocked(
 	{
 		rep.mu.Lock()

-		// Detect if we were already removed.
-		if !opts.ignoreDestroyStatus && rep.mu.destroyStatus.Removed() {
-			rep.mu.Unlock()
-			return nil // already removed, noop
+		if opts.DestroyData {
+			// Detect if we were already removed.
+			if rep.mu.destroyStatus.Removed() {
+				rep.mu.Unlock()
+				return nil // already removed, noop
+			}
+		} else {
+			// If the caller doesn't want to destroy the data because it already
+			// has done so, then it must have already also set the destroyStatus.
+			if !rep.mu.destroyStatus.Removed() {
+				rep.mu.Unlock()
+				log.Fatalf(ctx, "replica not marked as destroyed but data already destroyed: %v", rep)
+			}
 		}

 		desc = rep.mu.state.Desc
@@ -96,8 +100,7 @@
 		/// uninitialized.
 		if !rep.isInitializedRLocked() {
 			rep.mu.Unlock()
-			log.Fatalf(ctx, "uninitialized replica cannot be removed with removeInitializedReplica: %v",
-				rep)
+			log.Fatalf(ctx, "uninitialized replica cannot be removed with removeInitializedReplica: %v", rep)
 		}

 		// Mark the replica as removed before deleting data.
