Skip to content

Commit

Permalink
Merge #42068
Browse files Browse the repository at this point in the history
42068: storage: check whether the acquiring replica can receive the lease r=ajwerner a=ajwerner

Before this PR, when evaluating lease transfers or requests we'd check whether
the evaluating replica could receive the lease rather than the acquiring
replica. This patch fixes that bug by checking the state of the correct
replica.

Fixes #38720.

Release note (bug fix): The system no longer erroneously transfers leases
to replicas which are in the process of being removed which can lead to
ranges being effectively unavailable due to an invalid lease.

Co-authored-by: Andrew Werner <ajwerner@cockroachlabs.com>
  • Loading branch information
craig[bot] and ajwerner committed Oct 31, 2019
2 parents c721dc9 + 23cbf6c commit cb127fd
Show file tree
Hide file tree
Showing 6 changed files with 146 additions and 17 deletions.
13 changes: 8 additions & 5 deletions pkg/storage/batcheval/cmd_lease.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,14 @@ func newFailedLeaseTrigger(isTransfer bool) result.Result {
return trigger
}

func checkCanReceiveLease(rec EvalContext) error {
repDesc, ok := rec.Desc().GetReplicaDescriptor(rec.StoreID())
func checkCanReceiveLease(newLease *roachpb.Lease, rec EvalContext) error {
repDesc, ok := rec.Desc().GetReplicaDescriptorByID(newLease.Replica.ReplicaID)
if !ok {
return errors.AssertionFailedf(
`could not find replica for store %s in %s`, rec.StoreID(), rec.Desc())
if newLease.Replica.StoreID == rec.StoreID() {
return errors.AssertionFailedf(
`could not find replica for store %s in %s`, rec.StoreID(), rec.Desc())
}
return errors.Errorf(`replica %s not found in %s`, newLease.Replica, rec.Desc())
} else if t := repDesc.GetType(); t != roachpb.VOTER_FULL {
// NB: there's no harm in transferring the lease to a VOTER_INCOMING,
// but we disallow it anyway. On the other hand, transferring to
Expand All @@ -66,7 +69,7 @@ func checkCanReceiveLease(rec EvalContext) error {
// of minProposedTS needs to be "reversible" (tricky) or we make the
// lease evaluation succeed, though with a lease that's "invalid" so that
// a new lease can be requested right after.
return errors.Errorf(`replica of type %s cannot hold lease`, t)
return errors.Errorf(`replica %s of type %s cannot hold lease`, repDesc, t)
}
return nil
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/storage/batcheval/cmd_lease_request.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ func RequestLease(
//
// If this check is removed at some point, the filtering of learners on the
// sending side would have to be removed as well.
if err := checkCanReceiveLease(cArgs.EvalCtx); err != nil {
if err := checkCanReceiveLease(&args.Lease, cArgs.EvalCtx); err != nil {
rErr.Message = err.Error()
return newFailedLeaseTrigger(false /* isTransfer */), rErr
}
Expand Down
32 changes: 24 additions & 8 deletions pkg/storage/batcheval/cmd_lease_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,29 +113,45 @@ func TestLeaseCommandLearnerReplica(t *testing.T) {
ctx := context.Background()
const voterStoreID, learnerStoreID roachpb.StoreID = 1, 2
replicas := []roachpb.ReplicaDescriptor{
{StoreID: voterStoreID, Type: roachpb.ReplicaTypeVoterFull()},
{StoreID: learnerStoreID, Type: roachpb.ReplicaTypeLearner()},
{NodeID: 1, StoreID: voterStoreID, Type: roachpb.ReplicaTypeVoterFull(), ReplicaID: 1},
{NodeID: 2, StoreID: learnerStoreID, Type: roachpb.ReplicaTypeLearner(), ReplicaID: 2},
}
desc := roachpb.RangeDescriptor{}
desc.SetReplicas(roachpb.MakeReplicaDescriptors(replicas))
cArgs := CommandArgs{
EvalCtx: &mockEvalCtx{
storeID: learnerStoreID,
storeID: voterStoreID,
desc: &desc,
},
Args: &roachpb.TransferLeaseRequest{},
Args: &roachpb.TransferLeaseRequest{
Lease: roachpb.Lease{
Replica: replicas[1],
},
},
}

// Learners are not allowed to become leaseholders for now, see the comments
// in TransferLease and RequestLease.
_, err := TransferLease(ctx, nil, cArgs, nil)
require.EqualError(t, err, `replica of type LEARNER cannot hold lease`)
require.EqualError(t, err, `replica (n2,s2):2LEARNER of type LEARNER cannot hold lease`)

cArgs.Args = &roachpb.RequestLeaseRequest{}
_, err = RequestLease(ctx, nil, cArgs, nil)

const exp = `cannot replace lease repl=(n0,s0):? seq=0 start=0.000000000,0 exp=<nil> ` +
const expForUnknown = `cannot replace lease repl=(n0,s0):? seq=0 start=0.000000000,0 exp=<nil> ` +
`with repl=(n0,s0):? seq=0 start=0.000000000,0 exp=<nil>: ` +
`replica of type LEARNER cannot hold lease`
require.EqualError(t, err, exp)
`replica (n0,s0):? not found in r0:{-} [(n1,s1):1, (n2,s2):2LEARNER, next=0, gen=0?]`
require.EqualError(t, err, expForUnknown)

cArgs.Args = &roachpb.RequestLeaseRequest{
Lease: roachpb.Lease{
Replica: replicas[1],
},
}
_, err = RequestLease(ctx, nil, cArgs, nil)

const expForLearner = `cannot replace lease repl=(n0,s0):? seq=0 start=0.000000000,0 exp=<nil> ` +
`with repl=(n2,s2):2LEARNER seq=0 start=0.000000000,0 exp=<nil>: ` +
`replica (n2,s2):2LEARNER of type LEARNER cannot hold lease`
require.EqualError(t, err, expForLearner)
}
2 changes: 1 addition & 1 deletion pkg/storage/batcheval/cmd_lease_transfer.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ func TransferLease(
//
// If this check is removed at some point, the filtering of learners on the
// sending side would have to be removed as well.
if err := checkCanReceiveLease(cArgs.EvalCtx); err != nil {
if err := checkCanReceiveLease(&args.Lease, cArgs.EvalCtx); err != nil {
return newFailedLeaseTrigger(true /* isTransfer */), err
}

Expand Down
110 changes: 110 additions & 0 deletions pkg/storage/client_lease_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,26 @@ import (
"context"
"errors"
"fmt"
"runtime"
"sync"
"sync/atomic"
"testing"
"time"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/config"
"github.com/cockroachdb/cockroach/pkg/gossip"
"github.com/cockroachdb/cockroach/pkg/internal/client"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/storage/storagebase"
"github.com/cockroachdb/cockroach/pkg/storage/storagepb"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/stretchr/testify/require"
)

// TestStoreRangeLease verifies that regular ranges (not some special ones at
Expand Down Expand Up @@ -301,3 +309,105 @@ func TestGossipNodeLivenessOnLeaseChange(t *testing.T) {
return nil
})
}

// TestCannotTransferLeaseToVoterOutgoing ensures that the evaluation of lease
// requests for nodes which are already in the VOTER_OUTGOING state will fail.
func TestCannotTransferLeaseToVoterOutgoing(t *testing.T) {
defer leaktest.AfterTest(t)()
ctx := context.Background()

knobs, ltk := makeReplicationTestKnobs()
// Add a testing knob to allow us to block the change replicas command
// while it is being proposed. When we detect that the change replicas
// command to move n3 to VOTER_OUTGOING has been evaluated, we'll send
// the request to transfer the lease to n3. The hope is that it will
// get past the sanity above latch acquisition prior to change replicas
// command committing.
var scratchRangeID atomic.Value
scratchRangeID.Store(roachpb.RangeID(0))
changeReplicasChan := make(chan chan struct{}, 1)
shouldBlock := func(args storagebase.ProposalFilterArgs) bool {
// Block if a ChangeReplicas command is removing a node from our range.
return args.Req.RangeID == scratchRangeID.Load().(roachpb.RangeID) &&
args.Cmd.ReplicatedEvalResult.ChangeReplicas != nil &&
len(args.Cmd.ReplicatedEvalResult.ChangeReplicas.Removed()) > 0
}
blockIfShould := func(args storagebase.ProposalFilterArgs) {
if shouldBlock(args) {
ch := make(chan struct{})
changeReplicasChan <- ch
<-ch
}
}
knobs.Store.(*storage.StoreTestingKnobs).TestingProposalFilter = func(args storagebase.ProposalFilterArgs) *roachpb.Error {
blockIfShould(args)
return nil
}
tc := testcluster.StartTestCluster(t, 4, base.TestClusterArgs{
ServerArgs: base.TestServerArgs{Knobs: knobs},
ReplicationMode: base.ReplicationManual,
})
defer tc.Stopper().Stop(ctx)

scratchStartKey := tc.ScratchRange(t)
desc := tc.AddReplicasOrFatal(t, scratchStartKey, tc.Targets(1, 2)...)
scratchRangeID.Store(desc.RangeID)
// Make sure n1 has the lease to start with.
err := tc.Server(0).DB().AdminTransferLease(context.TODO(),
scratchStartKey, tc.Target(0).StoreID)
require.NoError(t, err)

// The test proceeds as follows:
//
// - Send an AdminChangeReplicasRequest to remove n3 and add n4
// - Block the step that moves n3 to VOTER_OUTGOING on changeReplicasChan
// - Send an AdminLeaseTransfer to make n3 the leaseholder
// - Try really hard to make sure that the lease transfer at least gets to
// latch acquisition before unblocking the ChangeReplicas.
// - Unblock the ChangeReplicas.
// - Make sure the lease transfer fails.

ltk.withStopAfterJointConfig(func() {
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
_, err = tc.Server(0).DB().AdminChangeReplicas(
client.ChangeReplicasCanMixAddAndRemoveContext(ctx),
scratchStartKey, desc, []roachpb.ReplicationChange{
{ChangeType: roachpb.REMOVE_REPLICA, Target: tc.Target(2)},
{ChangeType: roachpb.ADD_REPLICA, Target: tc.Target(3)},
})
require.NoError(t, err)
}()
ch := <-changeReplicasChan
wg.Add(1)
go func() {
defer wg.Done()
err := tc.Server(0).DB().AdminTransferLease(context.TODO(),
scratchStartKey, tc.Target(2).StoreID)
require.Error(t, err)
require.Regexp(t,
// The error generated during evaluation.
"replica.*of type VOTER_OUTGOING cannot hold lease|"+
// If the lease transfer request has not yet made it to the latching
// phase by the time we close(ch) below, we can receive the following
// error due to the sanity checking which happens in
// AdminTransferLease before attempting to evaluate the lease
// transfer.
// We have a sleep loop below to try to encourage the lease transfer
// to make it past that sanity check prior to letting the change
// of replicas proceed.
"cannot transfer lease to replica of type VOTER_OUTGOING", err.Error())
}()
// Try really hard to make sure that our request makes it past the
// sanity check error to the evaluation error.
for i := 0; i < 100; i++ {
runtime.Gosched()
time.Sleep(time.Microsecond)
}
close(ch)
wg.Wait()
})

}
4 changes: 2 additions & 2 deletions pkg/storage/replica_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1058,7 +1058,7 @@ func TestReplicaLease(t *testing.T) {
Args: &roachpb.RequestLeaseRequest{
Lease: lease,
},
}, &roachpb.RequestLeaseResponse{}); !testutils.IsError(err, "illegal lease") {
}, &roachpb.RequestLeaseResponse{}); !testutils.IsError(err, "replica \\(n0,s0\\):\\? not found in r1") {
t.Fatalf("unexpected error: %+v", err)
}
}
Expand Down Expand Up @@ -1463,7 +1463,7 @@ func TestReplicaLeaseRejectUnknownRaftNodeID(t *testing.T) {
// Remove ambiguity about where the "replica not found" error comes from.
pErr = (<-ch).Err
}
if !testutils.IsPError(pErr, "replica not found") {
if !testutils.IsPError(pErr, "replica.*not found") {
t.Errorf("unexpected error obtaining lease for invalid store: %v", pErr)
}
}
Expand Down

0 comments on commit cb127fd

Please sign in to comment.