Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

release-2.1: storage: Don't swallow ErrEpochAlreadyIncremented #36960

Merged
merged 1 commit into from
Apr 20, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions pkg/storage/node_liveness.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ var (
// the underlying liveness record has had its epoch incremented.
ErrEpochIncremented = errors.New("heartbeat failed on epoch increment")

// ErrEpochAlreadyIncremented is returned by IncrementEpoch when
// someone else has already incremented the epoch to the desired
// value.
ErrEpochAlreadyIncremented = errors.New("epoch already incremented")

errLiveClockNotLive = errors.New("not live")
)

Expand Down Expand Up @@ -712,8 +717,6 @@ func (nl *NodeLiveness) getLivenessLocked(nodeID roachpb.NodeID) (*Liveness, err
return nil, ErrNoLivenessRecord
}

var errEpochAlreadyIncremented = errors.New("epoch already incremented")

// IncrementEpoch is called to increment the current liveness epoch,
// thereby invalidating anything relying on the liveness of the
// previous epoch. This method does a conditional put on the node
Expand Down Expand Up @@ -741,15 +744,12 @@ func (nl *NodeLiveness) IncrementEpoch(ctx context.Context, liveness *Liveness)
if err := nl.updateLiveness(ctx, update, liveness, func(actual Liveness) error {
defer nl.maybeUpdate(actual)
if actual.Epoch > liveness.Epoch {
return errEpochAlreadyIncremented
return ErrEpochAlreadyIncremented
} else if actual.Epoch < liveness.Epoch {
return errors.Errorf("unexpected liveness epoch %d; expected >= %d", actual.Epoch, liveness.Epoch)
}
return errors.Errorf("mismatch incrementing epoch for %+v; actual is %+v", *liveness, actual)
}); err != nil {
if err == errEpochAlreadyIncremented {
return nil
}
return err
}

Expand Down
6 changes: 3 additions & 3 deletions pkg/storage/node_liveness_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -317,8 +317,8 @@ func TestNodeLivenessEpochIncrement(t *testing.T) {
t.Errorf("expected epoch increment == 1; got %d", c)
}

// Verify noop on incrementing an already-incremented epoch.
if err := mtc.nodeLivenesses[0].IncrementEpoch(context.Background(), oldLiveness); err != nil {
// Verify error on incrementing an already-incremented epoch.
if err := mtc.nodeLivenesses[0].IncrementEpoch(context.Background(), oldLiveness); err != storage.ErrEpochAlreadyIncremented {
t.Fatalf("unexpected error incrementing a non-live node: %s", err)
}

Expand Down Expand Up @@ -603,7 +603,7 @@ func TestNodeLivenessConcurrentIncrementEpochs(t *testing.T) {
}()
}
for i := 0; i < concurrency; i++ {
if err := <-errCh; err != nil {
if err := <-errCh; err != nil && err != storage.ErrEpochAlreadyIncremented {
t.Fatalf("concurrent increment epoch %d failed: %s", i, err)
}
}
Expand Down
33 changes: 32 additions & 1 deletion pkg/storage/replica_range_lease.go
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,38 @@ func (p *pendingLeaseRequest) requestLeaseAsync(
log.Info(ctx, err)
}
} else if err = p.repl.store.cfg.NodeLiveness.IncrementEpoch(ctx, status.Liveness); err != nil {
log.Error(ctx, err)
// If we get ErrEpochAlreadyIncremented, someone else beat
// us to it. This proves that the target node is truly
// dead *now*, but it doesn't prove that it was dead at
// status.Timestamp (which we've encoded into our lease
// request). It's possible that the node was temporarily
// considered dead but revived without having its epoch
// incremented, i.e. that it was in fact live at
// status.Timestamp.
//
// It would be incorrect to simply proceed to sending our
// lease request since our lease.Start may precede the
// effective end timestamp of the predecessor lease (the
// expiration of the last successful heartbeat before the
// epoch increment), and so under this lease this node's
// timestamp cache would not necessarily reflect all reads
// served by the prior leaseholder.
//
// It would be correct to bump the timestamp in the lease
// request and proceed, but that just sets up another race
// between this node and the one that already incremented
// the epoch. They're probably going to beat us this time
// too, so just return the NotLeaseHolderError here
// instead of trying to fix up the timestamps and submit
// the lease request.
//
// ErrEpochAlreadyIncremented is not an unusual situation,
// so we don't log it as an error.
//
// https://github.com/cockroachdb/cockroach/issues/35986
if err != ErrEpochAlreadyIncremented {
log.Error(ctx, err)
}
}
}
// Set error for propagation to all waiters below.
Expand Down