Skip to content

Commit

Permalink
storage: fix split / split-queue race
Browse files Browse the repository at this point in the history
Fix a race during splits where the right-hand side of a split was
temporarily removed from the Store.mu.replicas map when the split was
being committed, opening a tiny window where that replica could be added
to a queue (e.g. the split queue) but not be present when it came time
to process the replica. The fix is to simply not perform this temporary
removal from Store.mu.replicas and instead adjust adding a replica to
Store.mu.replicas to be successful if the replica either doesn't exist
in the map, or does exist and is the same object (pointer equality).

Fixes #30660
Fixes #29144

Release note: None
  • Loading branch information
petermattis committed Sep 29, 2018
1 parent 5058787 commit d3b0e73
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 3 deletions.
13 changes: 13 additions & 0 deletions pkg/storage/client_split_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3105,3 +3105,16 @@ func TestRangeLookupAsyncResolveIntent(t *testing.T) {
t.Fatal(err)
}
}

// Verify that replicas don't temporrily disappear from the replicas map during
// the splits. See #29144.
func TestStoreSplitDisappearingReplicas(t *testing.T) {
defer leaktest.AfterTest(t)()
stopper := stop.NewStopper()
defer stopper.Stop(context.TODO())
store, _ := createTestStore(t, stopper)
go storage.WatchForDisappearingReplicas(t, store)
if err := server.WaitForInitialSplits(store.DB()); err != nil {
t.Fatal(err)
}
}
22 changes: 22 additions & 0 deletions pkg/storage/helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -587,3 +587,25 @@ func WriteRandomDataToRange(
midKey = append(midKey, []byte("Z")...)
return midKey
}

func WatchForDisappearingReplicas(t testing.TB, store *Store) {
m := make(map[int64]struct{})
for {
select {
case <-store.Stopper().ShouldQuiesce():
return
default:
}

store.mu.replicas.Range(func(k int64, v unsafe.Pointer) bool {
m[k] = struct{}{}
return true
})

for k := range m {
if _, ok := store.mu.replicas.Load(k); !ok {
t.Fatalf("r%d disappeared from Store.mu.replicas map", k)
}
}
}
}
14 changes: 11 additions & 3 deletions pkg/storage/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -2452,7 +2452,11 @@ func (s *Store) SplitRange(
if exRng != rightRepl {
log.Fatalf(ctx, "found unexpected uninitialized replica: %s vs %s", exRng, rightRepl)
}
s.unlinkReplicaByRangeIDLocked(rightDesc.RangeID)
// NB: We only remove from uninitReplicas and the replicaQueues maps here
// so that we don't leave open a window where a replica is temporarily not
// present in Store.mu.replicas.
delete(s.mu.uninitReplicas, rightDesc.RangeID)
s.replicaQueues.Delete(int64(rightDesc.RangeID))
}

leftRepl.setDesc(ctx, &newLeftDesc)
Expand All @@ -2479,7 +2483,7 @@ func (s *Store) SplitRange(
leftRepl.writeStats.splitRequestCounts(rightRepl.writeStats)

if err := s.addReplicaInternalLocked(rightRepl); err != nil {
return errors.Errorf("couldn't insert range %v in replicasByKey btree: %s", rightRepl, err)
return errors.Errorf("unable to add replica %v: %s", rightRepl, err)
}

// Update the replica's cached byte thresholds. This is a no-op if the system
Expand Down Expand Up @@ -2670,7 +2674,11 @@ func (s *Store) removePlaceholderLocked(ctx context.Context, rngID roachpb.Range

// addReplicaToRangeMapLocked adds the replica to the replicas map.
func (s *Store) addReplicaToRangeMapLocked(repl *Replica) error {
if _, loaded := s.mu.replicas.LoadOrStore(int64(repl.RangeID), unsafe.Pointer(repl)); loaded {
// It's ok for the replica to exist in the replicas map as long as it is the
// same replica object. This occurs during splits where the right-hand side
// is added to the replicas map before it is initialized.
if existing, loaded := s.mu.replicas.LoadOrStore(
int64(repl.RangeID), unsafe.Pointer(repl)); loaded && (*Replica)(existing) != repl {
return errors.Errorf("%s: replica already exists", repl)
}
// Check whether the replica is unquiesced but not in the map. This
Expand Down

0 comments on commit d3b0e73

Please sign in to comment.