Skip to content

Commit

Permalink
[WIP] storage: detect snapshots intended for a different replica ID
Browse files Browse the repository at this point in the history
Before this change we'd apply a snapshot over an existing initialized replica
regardless of the replica ID its descriptor contained for the recipient store.
This logic is problematic if the current store was removed and re-added and in
the meantime the range subsumed one or more other ranges. This is specifically
problematic because the code assumes in maybeAcquireSnapshotMergeLock that if
the range descriptor grows then there will be collocated replicas to subsume.
This invariant doesn't hold if the replica was removed and then added back
after the range is modified.

Release note: None
  • Loading branch information
ajwerner committed Sep 4, 2019
1 parent b4caac8 commit 2787eb0
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 6 deletions.
10 changes: 10 additions & 0 deletions pkg/storage/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -3464,6 +3464,16 @@ func (s *Store) processRaftSnapshotRequest(
}
return nil
}(); err != nil {
// If the replica was destroyed then it must be deleted.
// TODO(ajwerner): returning an error here throws away this fresh
// snapshot. Ideally we'd be able to just use it.
if _, isReplicaTooOld := err.(*roachpb.ReplicaTooOldError); isReplicaTooOld {
if removeErr := s.removeReplicaImpl(ctx, r, snapHeader.State.Desc.NextReplicaID, RemoveOptions{
DestroyData: true,
}); removeErr != nil {
log.Infof(ctx, "error: failed to destroy replica: %v", removeErr)
}
}
return roachpb.NewError(err)
}

Expand Down
29 changes: 23 additions & 6 deletions pkg/storage/store_snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -599,6 +599,8 @@ func (s *Store) reserveSnapshot(
// this store's replica (i.e. the snapshot is not from an older incarnation of
// the replica) and a placeholder can be added to the replicasByKey map (if
// necessary). If a placeholder is required, it is returned as the first value.
// If a ReplicaTooOldError is returned then the existing replica should be
// destroyed.
//
// Both the store mu (and the raft mu for an existing replica if there is one)
// must be held.
Expand Down Expand Up @@ -628,21 +630,36 @@ func (s *Store) canApplySnapshotLocked(
existingRepl.raftMu.AssertHeld()

existingRepl.mu.RLock()
existingIsInitialized := existingRepl.isInitializedRLocked()
existingDesc := existingRepl.mu.state.Desc
existingIsInitialized := existingDesc.IsInitialized()
existingRepl.mu.RUnlock()

if existingIsInitialized {
// Regular Raft snapshots can't be refused at this point,
// even if they widen the existing replica. See the comments
// in Replica.maybeAcquireSnapshotMergeLock for how this is
// made safe.
// Regular Raft snapshots can't be refused at this point unless it's telling
// us we've since been removed and re-added.
// Otherwise we can't refuse this snapshot even if they widen the existing
// replica. See the comments in Replica.maybeAcquireSnapshotMergeLock for
// how this is made safe.
//
// NB: we expect the replica to know its replicaID at this point
// (i.e. !existingIsPreemptive), though perhaps it's possible
// that this isn't true if the leader initiates a Raft snapshot
// (that would provide a range descriptor with this replica in
// it) but this node reboots (temporarily forgetting its
// replicaID) before the snapshot arrives.

// If the snapshot's replica ID differs from the existing replica's ID then
// this replica must been removed and re-added rapidly. Return a
// ReplicaTooOldError to imply that this replica's data needs to be
// destroyed.
existingReplicaDesc, storeHasReplica := existingDesc.GetReplicaDescriptor(s.Ident.StoreID)
snapReplicaDesc, snapHasReplica := desc.GetReplicaDescriptor(s.Ident.StoreID)
if storeHasReplica && snapHasReplica && snapReplicaDesc.ReplicaID != existingReplicaDesc.ReplicaID {
return nil, &roachpb.ReplicaTooOldError{
ReplicaID: existingReplicaDesc.ReplicaID,
}
}

return nil, nil
}

Expand All @@ -661,7 +678,7 @@ func (s *Store) canApplySnapshotLocked(

// checkSnapshotOverlapLocked returns an error if the snapshot overlaps an
// existing replica or placeholder. Any replicas that do overlap have a good
// chance of being abandoned, so they're proactively handed to the GC queue .
// chance of being abandoned, so they're proactively handed to the GC queue.
func (s *Store) checkSnapshotOverlapLocked(
ctx context.Context, snapHeader *SnapshotRequest_Header,
) error {
Expand Down

0 comments on commit 2787eb0

Please sign in to comment.