diff --git a/pkg/storage/replicate_queue.go b/pkg/storage/replicate_queue.go
index a672d14ad6a8..dcb0571d0ae5 100644
--- a/pkg/storage/replicate_queue.go
+++ b/pkg/storage/replicate_queue.go
@@ -508,9 +508,17 @@ func (rq *replicateQueue) findRemoveTarget(
 	return rq.allocator.RemoveTarget(ctx, zone, candidates, existingReplicas)
 }
 
+// maybeTransferLeaseAway is called whenever a replica on a given store is
+// slated for removal. If the store corresponds to the store of the caller
+// (which is very likely to be the leaseholder), then this removal would fail.
+// Instead, this method will attempt to transfer the lease away, and returns
+// true to indicate to the caller that it should not pursue the current
+// replication change further because it is no longer the leaseholder. When the
+// returned bool is false, it should continue. On error, the caller should also
+// stop.
 func (rq *replicateQueue) maybeTransferLeaseAway(
 	ctx context.Context, repl *Replica, removeStoreID roachpb.StoreID, dryRun bool,
-) (transferred bool, _ error) {
+) (done bool, _ error) {
 	if removeStoreID != repl.store.StoreID() {
 		return false, nil
 	}
@@ -710,20 +718,49 @@ func (rq *replicateQueue) considerRebalance(
 		} else {
 			// We have a replica to remove and one we can add, so let's swap them
 			// out.
+			chgs := []roachpb.ReplicationChange{
+				// NB: we place the addition first because in the case of
+				// atomic replication changes being turned off, the changes
+				// will be executed individually in the order in which they
+				// appear.
+				{Target: addTarget, ChangeType: roachpb.ADD_REPLICA},
+				{Target: removeTarget, ChangeType: roachpb.REMOVE_REPLICA},
+			}
+
+			if len(existingReplicas) == 1 {
+				// If there's only one replica, the removal target is the
+				// leaseholder and this is unsupported and will fail. However,
+				// this is also the only way to rebalance in a single-replica
+				// range. If we try the atomic swap here, we'll fail doing
+				// nothing, and so we stay locked into the current distribution
+				// of replicas. (Note that maybeTransferLeaseAway above will not
+				// have found a target, and so will have returned (false, nil).)
+				//
+				// Do the best thing we can, which is carry out the addition
+				// only, which should succeed, and the next time we touch this
+				// range, we will have one more replica and hopefully it will
+				// take the lease and remove the current leaseholder.
+				//
+				// It's possible that "rebalancing deadlock" can occur in other
+				// scenarios; it's really impossible to tell from the code given
+				// the constraints we support. However, the lease transfer often
+				// does not happen spuriously, and we can't enter dangerous
+				// configurations sporadically, so this code path is only hit
+				// when we know it's necessary, picking the smaller of two evils.
+				//
+				// See https://github.com/cockroachdb/cockroach/issues/40333.
+				chgs = chgs[:1]
+				log.VEventf(ctx, 1, "can't swap replica due to lease; falling back to add")
+			}
+
 			rq.metrics.RebalanceReplicaCount.Inc(1)
 			log.VEventf(ctx, 1, "rebalancing %+v to %+v: %s",
 				removeTarget, addTarget, rangeRaftProgress(repl.RaftStatus(), existingReplicas))
+
 			if err := rq.changeReplicas(
 				ctx,
 				repl,
-				[]roachpb.ReplicationChange{
-					// NB: we place the addition first because in the case of
-					// atomic replication changes being turned off, the changes
-					// will be executed individually in the order in which they
-					// appear.
-					{Target: addTarget, ChangeType: roachpb.ADD_REPLICA},
-					{Target: removeTarget, ChangeType: roachpb.REMOVE_REPLICA},
-				},
+				chgs,
 				desc,
 				SnapshotRequest_REBALANCE,
 				storagepb.ReasonRebalance,
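
For reference, a minimal standalone sketch of the fallback logic above: build the swap with the addition first, then truncate to the addition alone when the range has a single replica. The types and the buildRebalanceChanges helper below are hypothetical stand-ins for the roachpb types and the considerRebalance code in the diff, not the actual CockroachDB API.

package main

import "fmt"

// Hypothetical stand-ins for roachpb.ReplicationChange and its fields,
// used only to illustrate the control flow from the diff above.
type changeType int

const (
	addReplica changeType = iota
	removeReplica
)

type target struct {
	NodeID, StoreID int
}

type replicationChange struct {
	Target     target
	ChangeType changeType
}

// buildRebalanceChanges mirrors the shape of the change in considerRebalance:
// the addition is placed first so that, if atomic replication changes are
// disabled and the changes execute one by one, the add happens before the
// remove. When the range has only one replica, the removal target is the
// leaseholder and the swap would fail, so fall back to the addition alone.
func buildRebalanceChanges(addTarget, removeTarget target, numExisting int) []replicationChange {
	chgs := []replicationChange{
		{Target: addTarget, ChangeType: addReplica},
		{Target: removeTarget, ChangeType: removeReplica},
	}
	if numExisting == 1 {
		chgs = chgs[:1] // carry out only the addition
	}
	return chgs
}

func main() {
	add := target{NodeID: 2, StoreID: 2}
	remove := target{NodeID: 1, StoreID: 1}

	fmt.Println(buildRebalanceChanges(add, remove, 3)) // full swap: add then remove
	fmt.Println(buildRebalanceChanges(add, remove, 1)) // single replica: add only
}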