Skip to content

Commit

Permalink
kvserver: make consistency check symmetrical
Browse files Browse the repository at this point in the history
Previously, a consistency check would fail if the local replica failed to
compute the checksum. This makes the consistency checks not resilient to issues
with the leader.

The only place depending on this assertion is the MVCC stats delta sub-check of
the consistency check. It is not strictly required to take delta from the local
replica for this purpose, any successful response from other replicas can be
used too.

This commit makes it so that any successful checksum computation can be
examined. As a result, all replicas are treated symmetrically, so the
consistency check is more resilient to leader issues.

Release note: None
  • Loading branch information
pav-kv committed Sep 20, 2022
1 parent 2c30691 commit 58212d4
Showing 1 changed file with 22 additions and 20 deletions.
42 changes: 22 additions & 20 deletions pkg/kv/kvserver/replica_consistency.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,12 +180,23 @@ func (r *Replica) checkConsistencyImpl(
}
res.Detail += buf.String()
} else {
// The Persisted stats are covered by the SHA computation, so if all the
// hashes match, we can take an arbitrary one that succeeded.
res.Detail += fmt.Sprintf("stats: %+v\n", results[0].Response.Persisted)
}
for _, result := range missing {
res.Detail += fmt.Sprintf("%s: error: %v\n", result.Replica, result.Err)
}

// NB: delta is examined only when minoritySHA == "", i.e. all the checksums
// match. It helps to further check that the recomputed MVCC stats match the
// stored stats.
//
// Both Persisted and Delta stats were computed deterministically from the
// data fed into the checksum, so if all checksums match, we can take the
// stats from an arbitrary replica that succeeded.
//
// TODO(pavelkalinnikov): Compare deltas to assert this assumption anyway.
delta := enginepb.MVCCStats(results[0].Response.Delta)
var haveDelta bool
{
Expand Down Expand Up @@ -329,8 +340,8 @@ func (r *Replica) collectChecksumFromReplica(

// runConsistencyCheck carries out a round of ComputeChecksum/CollectChecksum
// for the members of this range, returning the results (which it does not act
// upon). The first result will belong to the local replica, and in particular
// there is a first result when no error is returned.
// upon). Requires that the computation succeeds on at least one replica, and
// puts an arbitrary successful result first in the returned slice.
func (r *Replica) runConsistencyCheck(
ctx context.Context, req roachpb.ComputeChecksumRequest,
) ([]ConsistencyCheckResult, error) {
Expand All @@ -342,13 +353,7 @@ func (r *Replica) runConsistencyCheck(
}
ccRes := res.(*roachpb.ComputeChecksumResponse)

replSet := r.Desc().Replicas()
localReplica, found := replSet.GetReplicaDescriptorByID(r.replicaID)
if !found {
return nil, errors.New("could not get local replica descriptor")
}
replicas := replSet.Descriptors()

replicas := r.Desc().Replicas().Descriptors()
resultCh := make(chan ConsistencyCheckResult, len(replicas))
results := make([]ConsistencyCheckResult, 0, len(replicas))

Expand Down Expand Up @@ -384,22 +389,19 @@ func (r *Replica) runConsistencyCheck(
// Collect the results from all replicas, while the tasks are running.
for result := range resultCh {
results = append(results, result)
if result.Replica.IsSame(localReplica) {
// If we can't compute the local checksum, give up. This will cancel all
// the outstanding requests, and wait for the tasks above to terminate.
if err := result.Err; err != nil {
return nil, errors.Wrap(err, "computing own checksum")
}
// Put the local replica first in the list.
results[0], results[len(results)-1] = results[len(results)-1], results[0]
}
// If it was the last request, don't wait on the channel anymore.
if len(results) == len(replicas) {
break
}
}

return results, nil
// Find any successful result, and put it first.
for i, res := range results {
if res.Err == nil {
results[0], results[i] = res, results[0]
return results, nil
}
}
return nil, errors.New("could not collect checksum from any replica")
}

func (r *Replica) gcOldChecksumEntriesLocked(now time.Time) {
Expand Down

0 comments on commit 58212d4

Please sign in to comment.