From 3e4adfcd7e89abcc345e8fb3f038b60b5e2de94b Mon Sep 17 00:00:00 2001
From: Nathan VanBenschoten
Date: Wed, 21 Apr 2021 19:58:16 -0400
Subject: [PATCH] kv: don't allow node liveness to regress in Gossip network

In #64028, we fixed a long-standing flake in `TestLeaderAfterSplit`.
However, the test had actually gotten more flaky recently, which I
bisected back to df826cd.

The problem we occasionally see with the test is that all three replicas
of a post-split Range call an election, resulting in a hung vote. Since
the test is configured with RaftElectionTimeoutTicks=1000000, a follow-up
election is never called, so the test times out.

After some debugging, I found that the range would occasionally split
while the non-leaseholder nodes (n2 and n3) thought that the leaseholder
node (n1) was not live. This meant that their call to
`shouldCampaignOnWake` in the split trigger considered the RHS's
epoch-based lease to be invalid (state = ERROR). So all three replicas
would call an election and the test would get stuck.

The offending commit introduced this new flake because of this change:
https://github.com/cockroachdb/cockroach/commit/df826cdf700a79948d083827ca67967016a1a1af#diff-488a090afc4b6eaf56cd6d13b347bac67cb3313ce11c49df9ee8cd95fd73b3e8R454

Now that the call to `MaybeGossipNodeLiveness` is asynchronous on the
node-liveness range, it was possible for two calls to
`MaybeGossipNodeLiveness` to race: one asynchronously triggered by
`leasePostApplyLocked` and one synchronously triggered by
`handleReadWriteLocalEvalResult` due to a node liveness update. This
allowed for the following ordering of events:
```
- async call reads liveness(nid:1 epo:0 exp:0,0)
- sync call writes and then reads liveness(nid:1 epo:1 exp:1619645671.921265300,0)
- sync call adds liveness(nid:1 epo:1 exp:1619645671.921265300,0) to gossip
- async call adds liveness(nid:1 epo:0 exp:0,0) to gossip
```

Once this had occurred, n2 and n3 never again considered n1 live. Gossip
never recovered from this state because the liveness record was never
heartbeated again, due to the test's configuration of
`RaftElectionTimeoutTicks=1000000`.

This commit fixes the bug by ensuring that all calls to
MaybeGossipNodeLiveness and MaybeGossipSystemConfig hold the raft mutex.
This provides the necessary serialization to avoid data races, which was
actually already documented on MaybeGossipSystemConfig.
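To make the failure mode concrete, here is a minimal, self-contained Go
sketch of the same read-then-publish race. This is not CockroachDB code:
`liveness`, `gossipStore`, and the local `raftMu` variable are made-up
stand-ins for the liveness record, the last-writer-wins Gossip network,
and `Replica.raftMu`. A stale reader can overwrite a newer epoch in the
store unless a single mutex is held across the read and the publish.
```go
package main

import (
	"fmt"
	"sync"
)

// liveness is a simplified stand-in for a node liveness record.
type liveness struct {
	epoch int64
}

// gossipStore models a last-writer-wins store like Gossip: the most
// recent publish overwrites the previous value, even if it is older.
type gossipStore struct {
	mu  sync.Mutex
	val liveness
}

func (g *gossipStore) publish(l liveness) {
	g.mu.Lock()
	defer g.mu.Unlock()
	g.val = l // no epoch check, so a stale publisher can regress the record
}

func main() {
	var (
		raftMu sync.Mutex // plays the role of Replica.raftMu
		record = liveness{epoch: 0}
		gossip gossipStore
	)

	var wg sync.WaitGroup
	wg.Add(2)

	// "Async" path (the leasePostApplyLocked trigger): read the current
	// record, then publish what was read.
	go func() {
		defer wg.Done()
		raftMu.Lock() // without this, the snapshot can be stale by publish time
		defer raftMu.Unlock()
		snapshot := record
		gossip.publish(snapshot)
	}()

	// "Sync" path (a node liveness update): bump the epoch, then publish.
	go func() {
		defer wg.Done()
		raftMu.Lock()
		defer raftMu.Unlock()
		record.epoch++
		gossip.publish(record)
	}()

	wg.Wait()
	// With raftMu held across read+publish, an epoch that has been
	// gossiped can never be overwritten by an older one.
	fmt.Printf("gossiped epoch: %d, record epoch: %d\n", gossip.val.epoch, record.epoch)
}
```
Without the two `raftMu.Lock()` calls in the goroutines, the async path
can publish the stale epoch 0 after the sync path has already gossiped
epoch 1, which is exactly the regression described above.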
---
 pkg/kv/kvserver/client_replica_test.go |  8 +++-
 pkg/kv/kvserver/replica_gossip.go      | 57 +++++++++++++++-----------
 pkg/kv/kvserver/replica_proposal.go    | 17 +++++---
 3 files changed, 51 insertions(+), 31 deletions(-)

diff --git a/pkg/kv/kvserver/client_replica_test.go b/pkg/kv/kvserver/client_replica_test.go
index 9e0e2dffb00f..28c7356abcf0 100644
--- a/pkg/kv/kvserver/client_replica_test.go
+++ b/pkg/kv/kvserver/client_replica_test.go
@@ -3211,7 +3211,9 @@ func TestStrictGCEnforcement(t *testing.T) {
 				_, r := getFirstStoreReplica(t, s, tableKey)
 				if _, z := r.DescAndZone(); z.GC.TTLSeconds != int32(exp) {
 					_, sysCfg := getFirstStoreReplica(t, tc.Server(i), keys.SystemConfigSpan.Key)
-					require.NoError(t, sysCfg.MaybeGossipSystemConfig(ctx))
+					sysCfg.RaftLock()
+					require.NoError(t, sysCfg.MaybeGossipSystemConfigRaftMuLocked(ctx))
+					sysCfg.RaftUnlock()
 					return errors.Errorf("expected %d, got %d", exp, z.GC.TTLSeconds)
 				}
 			}
@@ -3225,7 +3227,9 @@ func TestStrictGCEnforcement(t *testing.T) {
 		for i := 0; i < tc.NumServers(); i++ {
 			s, r := getFirstStoreReplica(t, tc.Server(i), keys.SystemConfigSpan.Key)
 			if kvserver.StrictGCEnforcement.Get(&s.ClusterSettings().SV) != val {
-				require.NoError(t, r.MaybeGossipSystemConfig(ctx))
+				r.RaftLock()
+				require.NoError(t, r.MaybeGossipSystemConfigRaftMuLocked(ctx))
+				r.RaftUnlock()
 				return errors.Errorf("expected %v, got %v", val, !val)
 			}
 		}
diff --git a/pkg/kv/kvserver/replica_gossip.go b/pkg/kv/kvserver/replica_gossip.go
index 384360f2dd69..8e265d27f8b8 100644
--- a/pkg/kv/kvserver/replica_gossip.go
+++ b/pkg/kv/kvserver/replica_gossip.go
@@ -64,20 +64,23 @@ func (r *Replica) shouldGossip(ctx context.Context) bool {
 	return r.OwnsValidLease(ctx, r.store.Clock().NowAsClockTimestamp())
 }
-// MaybeGossipSystemConfig scans the entire SystemConfig span and gossips it.
-// Further calls come from the trigger on EndTxn or range lease acquisition.
+// MaybeGossipSystemConfigRaftMuLocked scans the entire SystemConfig span and
+// gossips it. Further calls come from the trigger on EndTxn or range lease
+// acquisition.
 //
-// Note that MaybeGossipSystemConfig gossips information only when the
-// lease is actually held. The method does not request a range lease
-// here since RequestLease and applyRaftCommand call the method and we
-// need to avoid deadlocking in redirectOnOrAcquireLease.
+// Note that MaybeGossipSystemConfigRaftMuLocked gossips information only when
+// the lease is actually held. The method does not request a range lease here
+// since RequestLease and applyRaftCommand call the method and we need to avoid
+// deadlocking in redirectOnOrAcquireLease.
 //
-// MaybeGossipSystemConfig must only be called from Raft commands
-// (which provide the necessary serialization to avoid data races).
+// MaybeGossipSystemConfigRaftMuLocked must only be called from Raft commands
+// while holding the raftMu (which provide the necessary serialization to avoid
+// data races).
 //
-// TODO(nvanbenschoten,bdarnell): even though this is best effort, we
-// should log louder when we continually fail to gossip system config.
-func (r *Replica) MaybeGossipSystemConfig(ctx context.Context) error {
+// TODO(nvanbenschoten,bdarnell): even though this is best effort, we should log
+// louder when we continually fail to gossip system config.
+func (r *Replica) MaybeGossipSystemConfigRaftMuLocked(ctx context.Context) error {
+	r.raftMu.AssertHeld()
 	if r.store.Gossip() == nil {
 		log.VEventf(ctx, 2, "not gossiping system config because gossip isn't initialized")
 		return nil
 	}
@@ -124,30 +127,36 @@ func (r *Replica) MaybeGossipSystemConfig(ctx context.Context) error {
 	return nil
 }
 
-// MaybeGossipSystemConfigIfHaveFailure is a trigger to gossip the system config
-// due to an abort of a transaction keyed in the system config span. It will
-// call MaybeGossipSystemConfig if failureToGossipSystemConfig is true.
-func (r *Replica) MaybeGossipSystemConfigIfHaveFailure(ctx context.Context) error {
+// MaybeGossipSystemConfigIfHaveFailureRaftMuLocked is a trigger to gossip the
+// system config due to an abort of a transaction keyed in the system config
+// span. It will call MaybeGossipSystemConfigRaftMuLocked if
+// failureToGossipSystemConfig is true.
+func (r *Replica) MaybeGossipSystemConfigIfHaveFailureRaftMuLocked(ctx context.Context) error {
 	r.mu.RLock()
 	failed := r.mu.failureToGossipSystemConfig
 	r.mu.RUnlock()
 	if !failed {
 		return nil
 	}
-	return r.MaybeGossipSystemConfig(ctx)
+	return r.MaybeGossipSystemConfigRaftMuLocked(ctx)
 }
 
-// MaybeGossipNodeLiveness gossips information for all node liveness
-// records stored on this range. To scan and gossip, this replica
-// must hold the lease to a range which contains some or all of the
-// node liveness records. After scanning the records, it checks
-// against what's already in gossip and only gossips records which
-// are out of date.
-func (r *Replica) MaybeGossipNodeLiveness(ctx context.Context, span roachpb.Span) error {
+// MaybeGossipNodeLivenessRaftMuLocked gossips information for all node liveness
+// records stored on this range. To scan and gossip, this replica must hold the
+// lease to a range which contains some or all of the node liveness records.
+// After scanning the records, it checks against what's already in gossip and
+// only gossips records which are out of date.
+//
+// MaybeGossipNodeLivenessRaftMuLocked must only be called from Raft commands
+// while holding the raftMu (which provide the necessary serialization to avoid
+// data races).
+func (r *Replica) MaybeGossipNodeLivenessRaftMuLocked(
+	ctx context.Context, span roachpb.Span,
+) error {
+	r.raftMu.AssertHeld()
 	if r.store.Gossip() == nil || !r.IsInitialized() {
 		return nil
 	}
-
 	if !r.ContainsKeyRange(span.Key, span.EndKey) || !r.shouldGossip(ctx) {
 		return nil
 	}
diff --git a/pkg/kv/kvserver/replica_proposal.go b/pkg/kv/kvserver/replica_proposal.go
index 5bd7f506744f..7ce8818c310c 100644
--- a/pkg/kv/kvserver/replica_proposal.go
+++ b/pkg/kv/kvserver/replica_proposal.go
@@ -503,10 +503,17 @@ func (r *Replica) leasePostApplyLocked(
 	// NB: run these in an async task to keep them out of the critical section
 	// (r.mu is held here).
 	_ = r.store.stopper.RunAsyncTask(ctx, "lease-triggers", func(ctx context.Context) {
-		if err := r.MaybeGossipSystemConfig(ctx); err != nil {
+		// Re-acquire the raftMu, as we are now in an async task.
+		r.raftMu.Lock()
+		defer r.raftMu.Unlock()
+		if _, err := r.IsDestroyed(); err != nil {
+			// Nothing to do.
+			return
+		}
+		if err := r.MaybeGossipSystemConfigRaftMuLocked(ctx); err != nil {
 			log.Errorf(ctx, "%v", err)
 		}
-		if err := r.MaybeGossipNodeLiveness(ctx, keys.NodeLivenessSpan); err != nil {
+		if err := r.MaybeGossipNodeLivenessRaftMuLocked(ctx, keys.NodeLivenessSpan); err != nil {
 			log.Errorf(ctx, "%v", err)
 		}
 
@@ -700,21 +707,21 @@ func (r *Replica) handleReadWriteLocalEvalResult(ctx context.Context, lResult re
 	}
 
 	if lResult.MaybeGossipSystemConfig {
-		if err := r.MaybeGossipSystemConfig(ctx); err != nil {
+		if err := r.MaybeGossipSystemConfigRaftMuLocked(ctx); err != nil {
 			log.Errorf(ctx, "%v", err)
 		}
 		lResult.MaybeGossipSystemConfig = false
 	}
 
 	if lResult.MaybeGossipSystemConfigIfHaveFailure {
-		if err := r.MaybeGossipSystemConfigIfHaveFailure(ctx); err != nil {
+		if err := r.MaybeGossipSystemConfigIfHaveFailureRaftMuLocked(ctx); err != nil {
 			log.Errorf(ctx, "%v", err)
 		}
 		lResult.MaybeGossipSystemConfigIfHaveFailure = false
 	}
 
 	if lResult.MaybeGossipNodeLiveness != nil {
-		if err := r.MaybeGossipNodeLiveness(ctx, *lResult.MaybeGossipNodeLiveness); err != nil {
+		if err := r.MaybeGossipNodeLivenessRaftMuLocked(ctx, *lResult.MaybeGossipNodeLiveness); err != nil {
 			log.Errorf(ctx, "%v", err)
 		}
 		lResult.MaybeGossipNodeLiveness = nil