diff --git a/pkg/kv/kvserver/client_replica_test.go b/pkg/kv/kvserver/client_replica_test.go index 9e0e2dffb00f..28c7356abcf0 100644 --- a/pkg/kv/kvserver/client_replica_test.go +++ b/pkg/kv/kvserver/client_replica_test.go @@ -3211,7 +3211,9 @@ func TestStrictGCEnforcement(t *testing.T) { _, r := getFirstStoreReplica(t, s, tableKey) if _, z := r.DescAndZone(); z.GC.TTLSeconds != int32(exp) { _, sysCfg := getFirstStoreReplica(t, tc.Server(i), keys.SystemConfigSpan.Key) - require.NoError(t, sysCfg.MaybeGossipSystemConfig(ctx)) + sysCfg.RaftLock() + require.NoError(t, sysCfg.MaybeGossipSystemConfigRaftMuLocked(ctx)) + sysCfg.RaftUnlock() return errors.Errorf("expected %d, got %d", exp, z.GC.TTLSeconds) } } @@ -3225,7 +3227,9 @@ func TestStrictGCEnforcement(t *testing.T) { for i := 0; i < tc.NumServers(); i++ { s, r := getFirstStoreReplica(t, tc.Server(i), keys.SystemConfigSpan.Key) if kvserver.StrictGCEnforcement.Get(&s.ClusterSettings().SV) != val { - require.NoError(t, r.MaybeGossipSystemConfig(ctx)) + r.RaftLock() + require.NoError(t, r.MaybeGossipSystemConfigRaftMuLocked(ctx)) + r.RaftUnlock() return errors.Errorf("expected %v, got %v", val, !val) } } diff --git a/pkg/kv/kvserver/replica_gossip.go b/pkg/kv/kvserver/replica_gossip.go index 384360f2dd69..8e265d27f8b8 100644 --- a/pkg/kv/kvserver/replica_gossip.go +++ b/pkg/kv/kvserver/replica_gossip.go @@ -64,20 +64,23 @@ func (r *Replica) shouldGossip(ctx context.Context) bool { return r.OwnsValidLease(ctx, r.store.Clock().NowAsClockTimestamp()) } -// MaybeGossipSystemConfig scans the entire SystemConfig span and gossips it. -// Further calls come from the trigger on EndTxn or range lease acquisition. +// MaybeGossipSystemConfigRaftMuLocked scans the entire SystemConfig span and +// gossips it. Further calls come from the trigger on EndTxn or range lease +// acquisition. // -// Note that MaybeGossipSystemConfig gossips information only when the -// lease is actually held. The method does not request a range lease -// here since RequestLease and applyRaftCommand call the method and we -// need to avoid deadlocking in redirectOnOrAcquireLease. +// Note that MaybeGossipSystemConfigRaftMuLocked gossips information only when +// the lease is actually held. The method does not request a range lease here +// since RequestLease and applyRaftCommand call the method and we need to avoid +// deadlocking in redirectOnOrAcquireLease. // -// MaybeGossipSystemConfig must only be called from Raft commands -// (which provide the necessary serialization to avoid data races). +// MaybeGossipSystemConfigRaftMuLocked must only be called from Raft commands +// while holding the raftMu (which provide the necessary serialization to avoid +// data races). // -// TODO(nvanbenschoten,bdarnell): even though this is best effort, we -// should log louder when we continually fail to gossip system config. -func (r *Replica) MaybeGossipSystemConfig(ctx context.Context) error { +// TODO(nvanbenschoten,bdarnell): even though this is best effort, we should log +// louder when we continually fail to gossip system config. +func (r *Replica) MaybeGossipSystemConfigRaftMuLocked(ctx context.Context) error { + r.raftMu.AssertHeld() if r.store.Gossip() == nil { log.VEventf(ctx, 2, "not gossiping system config because gossip isn't initialized") return nil @@ -124,30 +127,36 @@ func (r *Replica) MaybeGossipSystemConfig(ctx context.Context) error { return nil } -// MaybeGossipSystemConfigIfHaveFailure is a trigger to gossip the system config -// due to an abort of a transaction keyed in the system config span. It will -// call MaybeGossipSystemConfig if failureToGossipSystemConfig is true. -func (r *Replica) MaybeGossipSystemConfigIfHaveFailure(ctx context.Context) error { +// MaybeGossipSystemConfigIfHaveFailureRaftMuLocked is a trigger to gossip the +// system config due to an abort of a transaction keyed in the system config +// span. It will call MaybeGossipSystemConfigRaftMuLocked if +// failureToGossipSystemConfig is true. +func (r *Replica) MaybeGossipSystemConfigIfHaveFailureRaftMuLocked(ctx context.Context) error { r.mu.RLock() failed := r.mu.failureToGossipSystemConfig r.mu.RUnlock() if !failed { return nil } - return r.MaybeGossipSystemConfig(ctx) + return r.MaybeGossipSystemConfigRaftMuLocked(ctx) } -// MaybeGossipNodeLiveness gossips information for all node liveness -// records stored on this range. To scan and gossip, this replica -// must hold the lease to a range which contains some or all of the -// node liveness records. After scanning the records, it checks -// against what's already in gossip and only gossips records which -// are out of date. -func (r *Replica) MaybeGossipNodeLiveness(ctx context.Context, span roachpb.Span) error { +// MaybeGossipNodeLivenessRaftMuLocked gossips information for all node liveness +// records stored on this range. To scan and gossip, this replica must hold the +// lease to a range which contains some or all of the node liveness records. +// After scanning the records, it checks against what's already in gossip and +// only gossips records which are out of date. +// +// MaybeGossipNodeLivenessRaftMuLocked must only be called from Raft commands +// while holding the raftMu (which provide the necessary serialization to avoid +// data races). +func (r *Replica) MaybeGossipNodeLivenessRaftMuLocked( + ctx context.Context, span roachpb.Span, +) error { + r.raftMu.AssertHeld() if r.store.Gossip() == nil || !r.IsInitialized() { return nil } - if !r.ContainsKeyRange(span.Key, span.EndKey) || !r.shouldGossip(ctx) { return nil } diff --git a/pkg/kv/kvserver/replica_proposal.go b/pkg/kv/kvserver/replica_proposal.go index 5bd7f506744f..7ce8818c310c 100644 --- a/pkg/kv/kvserver/replica_proposal.go +++ b/pkg/kv/kvserver/replica_proposal.go @@ -503,10 +503,17 @@ func (r *Replica) leasePostApplyLocked( // NB: run these in an async task to keep them out of the critical section // (r.mu is held here). _ = r.store.stopper.RunAsyncTask(ctx, "lease-triggers", func(ctx context.Context) { - if err := r.MaybeGossipSystemConfig(ctx); err != nil { + // Re-acquire the raftMu, as we are now in an async task. + r.raftMu.Lock() + defer r.raftMu.Unlock() + if _, err := r.IsDestroyed(); err != nil { + // Nothing to do. + return + } + if err := r.MaybeGossipSystemConfigRaftMuLocked(ctx); err != nil { log.Errorf(ctx, "%v", err) } - if err := r.MaybeGossipNodeLiveness(ctx, keys.NodeLivenessSpan); err != nil { + if err := r.MaybeGossipNodeLivenessRaftMuLocked(ctx, keys.NodeLivenessSpan); err != nil { log.Errorf(ctx, "%v", err) } @@ -700,21 +707,21 @@ func (r *Replica) handleReadWriteLocalEvalResult(ctx context.Context, lResult re } if lResult.MaybeGossipSystemConfig { - if err := r.MaybeGossipSystemConfig(ctx); err != nil { + if err := r.MaybeGossipSystemConfigRaftMuLocked(ctx); err != nil { log.Errorf(ctx, "%v", err) } lResult.MaybeGossipSystemConfig = false } if lResult.MaybeGossipSystemConfigIfHaveFailure { - if err := r.MaybeGossipSystemConfigIfHaveFailure(ctx); err != nil { + if err := r.MaybeGossipSystemConfigIfHaveFailureRaftMuLocked(ctx); err != nil { log.Errorf(ctx, "%v", err) } lResult.MaybeGossipSystemConfigIfHaveFailure = false } if lResult.MaybeGossipNodeLiveness != nil { - if err := r.MaybeGossipNodeLiveness(ctx, *lResult.MaybeGossipNodeLiveness); err != nil { + if err := r.MaybeGossipNodeLivenessRaftMuLocked(ctx, *lResult.MaybeGossipNodeLiveness); err != nil { log.Errorf(ctx, "%v", err) } lResult.MaybeGossipNodeLiveness = nil