From fd6c83c530cdb5135ed367d96306ee02e22132b4 Mon Sep 17 00:00:00 2001
From: Daniel Harrison
Date: Wed, 26 Jun 2019 11:30:06 -0700
Subject: [PATCH] storage: truncate raft log less aggressively when replica is
 missing

Previously, we'd hold off on truncating the raft log if a replica was
missing but contactable in the last 10 seconds. This meant that if a
node was down for *more* than 10 seconds, there was a good chance that
we'd truncate logs for some of its replicas (especially for its
high-traffic ones) and it would need snapshots for them when it came
back up.

This was for two reasons. First, we've historically assumed that it's
cheaper to catch a far-behind node up with a snapshot than with
entries. Second, when we replicated the raft log truncation, we'd have
to include it in snapshots anyway, but that's changed recently.

The problem is when a node is down for a short time but more than 10
seconds (think a couple minutes). It might come back up to find that it
needs a snapshot for most ranges. We rate limit snapshots fairly
aggressively because they've been disruptive in the past, but this
means that it could potentially take hours for a node to recover from a
2 minute outage.

This would be merely unfortunate if there weren't a second compounding
issue. A recently restarted node has a cold leaseholder cache. When it
gets traffic for one of its replicas, it first tries itself as the
leaseholder (maybe it will get lucky and won't need the network hop).
Unfortunately, if the replica needs a snapshot, this decision currently
blocks on it. This means that requests sent to the recently started
node could block for as long as the heavily-throttled snapshots take,
hours or even days.

Short outages of more than 10 seconds are reasonably common with
routine maintenance (rolling to a new version, swapping hardware,
etc.), so it's likely that customers will hit this (and one did).

This commit ties the threshold that the raft log queue uses for recent
activity to `server.time_until_store_dead`, which is already the
threshold where we give up on a down node and start transferring away
its replicas. It defaults to 5 minutes, which is still short, but it's
more reasonable than 10 seconds. Crucially, it is also a cluster
setting, so it can be overridden.

We'd like to move even further in the direction of leniency about raft
log truncation while a replica is missing, but this is a compromise
that's intended to be less risky to backport to 19.1.x.

Partial mitigation for #37906

Potentially also helps with #36879

Release note (bug fix): Nodes that have been down for less than
`server.time_until_store_dead` now recover more quickly when they
rejoin.
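The sketch below illustrates the mechanism in isolation. It is a
simplified, hypothetical example rather than CockroachDB code: the
helper name, map type, and replica IDs are stand-ins for
`lastUpdateTimesMap.isFollowerActive` and its callers in the diff. The
point is only that the activity window becomes a caller-supplied
threshold (now fed from `server.time_until_store_dead`) instead of a
hard-coded ~10s constant.

    package main

    import (
        "fmt"
        "time"
    )

    // isFollowerActive is a standalone, illustrative version of the check:
    // a follower counts as recently active only if the leader has heard
    // from it within the supplied threshold.
    func isFollowerActive(lastUpdate map[int64]time.Time, replicaID int64, now time.Time, threshold time.Duration) bool {
        last, ok := lastUpdate[replicaID]
        if !ok {
            return false
        }
        return now.Sub(last) <= threshold
    }

    func main() {
        now := time.Now()
        // Follower 3 was last heard from 90 seconds ago, e.g. a node that
        // is partway through a routine restart.
        lastUpdate := map[int64]time.Time{3: now.Add(-90 * time.Second)}

        // With a ~10s window the follower looks inactive, so its raft log
        // would be truncated and it would later need a snapshot.
        fmt.Println(isFollowerActive(lastUpdate, 3, now, 10*time.Second)) // false

        // With the window tied to server.time_until_store_dead (5m by
        // default), the log is kept and the follower catches up from entries.
        fmt.Println(isFollowerActive(lastUpdate, 3, now, 5*time.Minute)) // true
    }

Since `server.time_until_store_dead` is an ordinary cluster setting,
operators who expect longer routine outages can raise it further, at
the cost of also delaying when a dead store's replicas start being
moved away.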
---
 pkg/cmd/roachtest/registry.go         |  1 +
 pkg/cmd/roachtest/restart.go          | 71 +++++++++++++++++++++++++++
 pkg/storage/helpers_test.go           |  2 +-
 pkg/storage/raft_log_queue.go         | 15 ++++-
 pkg/storage/raft_log_queue_test.go    | 10 ++--
 pkg/storage/replica_proposal_quota.go |  2 +-
 pkg/storage/replica_raft.go           |  4 +-
 pkg/storage/split_delay_helper.go     |  2 +-
 8 files changed, 96 insertions(+), 11 deletions(-)
 create mode 100644 pkg/cmd/roachtest/restart.go

diff --git a/pkg/cmd/roachtest/registry.go b/pkg/cmd/roachtest/registry.go
index 72ab56d76ec8..72a3f1b4d2ff 100644
--- a/pkg/cmd/roachtest/registry.go
+++ b/pkg/cmd/roachtest/registry.go
@@ -52,6 +52,7 @@ func registerTests(r *registry) {
 	registerQueue(r)
 	registerRebalanceLoad(r)
 	registerReplicaGC(r)
+	registerRestart(r)
 	registerRestore(r)
 	registerRoachmart(r)
 	registerScaleData(r)
diff --git a/pkg/cmd/roachtest/restart.go b/pkg/cmd/roachtest/restart.go
new file mode 100644
index 000000000000..21b261e9ad52
--- /dev/null
+++ b/pkg/cmd/roachtest/restart.go
@@ -0,0 +1,71 @@
+// Copyright 2019 The Cockroach Authors.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.txt.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0, included in the file
+// licenses/APL.txt.
+
+package main
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
+)
+
+func runRestart(ctx context.Context, t *test, c *cluster, downDuration time.Duration) {
+	crdbNodes := c.Range(1, c.nodes)
+	workloadNode := c.Node(1)
+	const restartNode = 3
+
+	t.Status("installing cockroach")
+	c.Put(ctx, cockroach, "./cockroach", crdbNodes)
+	c.Start(ctx, t, crdbNodes, startArgs(`--args=--vmodule=raft_log_queue=3`))
+
+	// We don't really need tpcc, we just need a good amount of traffic and a good
+	// amount of data.
+	t.Status("importing tpcc fixture")
+	c.Run(ctx, workloadNode,
+		"./cockroach workload fixtures import tpcc --warehouses=100 --fks=false --checks=false")
+
+	// Stop a node.
+	c.Stop(ctx, c.Node(restartNode))
+
+	// Wait for between 10s and `server.time_until_store_dead` while sending
+	// traffic to one of the nodes that are not down. This used to cause lots of
+	// raft log truncation, which caused node 3 to need lots of snapshots when it
+	// came back up.
+	c.Run(ctx, workloadNode, "./cockroach workload run tpcc --warehouses=100 "+
+		fmt.Sprintf("--tolerate-errors --wait=false --duration=%s", downDuration))
+
+	// Bring it back up and make sure it can serve a query within a reasonable
+	// time limit. For now, less time than it was down for.
+	c.Start(ctx, t, c.Node(restartNode))
+	start := timeutil.Now()
+	restartNodeDB := c.Conn(ctx, restartNode)
+	if _, err := restartNodeDB.Exec(`SELECT count(*) FROM tpcc.order_line`); err != nil {
+		t.Fatal(err)
+	}
+	if took := timeutil.Since(start); took > downDuration {
+		t.Fatalf(`expected to recover within %s took %s`, downDuration, took)
+	} else {
+		c.l.Printf(`connecting and query finished in %s`, took)
+	}
+}
+
+func registerRestart(r *registry) {
+	r.Add(testSpec{
+		Name:    fmt.Sprintf("restart/down-for-2m"),
+		Cluster: makeClusterSpec(3),
+		// "cockroach workload is only in 19.1+"
+		MinVersion: "v19.1.0",
+		Run: func(ctx context.Context, t *test, c *cluster) {
+			runRestart(ctx, t, c, 2*time.Minute)
+		},
+	})
+}
diff --git a/pkg/storage/helpers_test.go b/pkg/storage/helpers_test.go
index 03468d122c85..b38f8f3be50a 100644
--- a/pkg/storage/helpers_test.go
+++ b/pkg/storage/helpers_test.go
@@ -293,7 +293,7 @@ func (r *Replica) QuotaReleaseQueueLen() int {
 func (r *Replica) IsFollowerActive(ctx context.Context, followerID roachpb.ReplicaID) bool {
 	r.mu.RLock()
 	defer r.mu.RUnlock()
-	return r.mu.lastUpdateTimes.isFollowerActive(ctx, followerID, timeutil.Now())
+	return r.mu.lastUpdateTimes.isFollowerActive(ctx, followerID, timeutil.Now(), MaxQuotaReplicaLivenessDuration)
 }
 
 func (r *Replica) CommandSizesLen() int {
diff --git a/pkg/storage/raft_log_queue.go b/pkg/storage/raft_log_queue.go
index 3306ee7380de..816ac634dc31 100644
--- a/pkg/storage/raft_log_queue.go
+++ b/pkg/storage/raft_log_queue.go
@@ -21,6 +21,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/gossip"
 	"github.com/cockroachdb/cockroach/pkg/internal/client"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
+	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
 	"github.com/cockroachdb/cockroach/pkg/util"
 	"github.com/cockroachdb/cockroach/pkg/util/hlc"
 	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
@@ -201,7 +202,8 @@ func newTruncateDecision(ctx context.Context, r *Replica) (truncateDecision, err
 	log.Eventf(ctx, "raft status before lastUpdateTimes check: %+v", raftStatus.Progress)
 	log.Eventf(ctx, "lastUpdateTimes: %+v", r.mu.lastUpdateTimes)
 	updateRaftProgressFromActivity(
-		ctx, raftStatus.Progress, r.descRLocked().Replicas().Unwrap(), r.mu.lastUpdateTimes, now,
+		ctx, r.ClusterSettings(), raftStatus.Progress, r.descRLocked().Replicas().Unwrap(),
+		r.mu.lastUpdateTimes, now,
 	)
 	log.Eventf(ctx, "raft status after lastUpdateTimes check: %+v", raftStatus.Progress)
 	r.mu.RUnlock()
@@ -229,18 +231,25 @@ func newTruncateDecision(ctx context.Context, r *Replica) (truncateDecision, err
 
 func updateRaftProgressFromActivity(
 	ctx context.Context,
+	settings *cluster.Settings,
 	prs map[uint64]raft.Progress,
 	replicas []roachpb.ReplicaDescriptor,
 	lastUpdate lastUpdateTimesMap,
 	now time.Time,
 ) {
+	// TimeUntilStoreDead is the amount of time before we consider a store dead
+	// and start moving away its replicas. Don't truncate the raft log before then
+	// either, otherwise it may need a potentially huge number of snapshots if it
+	// comes back up. See #37906.
+	recentActiveThreshold := TimeUntilStoreDead.Get(&settings.SV)
+
 	for _, replDesc := range replicas {
 		replicaID := replDesc.ReplicaID
 		pr, ok := prs[uint64(replicaID)]
 		if !ok {
 			continue
 		}
-		pr.RecentActive = lastUpdate.isFollowerActive(ctx, replicaID, now)
+		pr.RecentActive = lastUpdate.isFollowerActive(ctx, replicaID, now, recentActiveThreshold)
 		// Override this field for safety since we don't use it. Instead, we use
 		// pendingSnapshotIndex from above which is also populated for
 		// preemptive snapshots.
@@ -493,7 +502,7 @@ func (rlq *raftLogQueue) shouldQueue(
 }
 
 // shouldQueueImpl returns whether the given truncate decision should lead to
-// a log truncation. This is either the case if the decision says so or of
+// a log truncation. This is either the case if the decision says so or if
 // we want to recompute the log size (in which case `recomputeRaftLogSize` and
 // `shouldQ` are both true and a reasonable priority is returned).
 func (rlq *raftLogQueue) shouldQueueImpl(
diff --git a/pkg/storage/raft_log_queue_test.go b/pkg/storage/raft_log_queue_test.go
index f0f220dde6e5..c2449163ed7b 100644
--- a/pkg/storage/raft_log_queue_test.go
+++ b/pkg/storage/raft_log_queue_test.go
@@ -22,6 +22,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/internal/client"
 	"github.com/cockroachdb/cockroach/pkg/keys"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
+	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
 	"github.com/cockroachdb/cockroach/pkg/storage/engine"
 	"github.com/cockroachdb/cockroach/pkg/testutils"
 	"github.com/cockroachdb/cockroach/pkg/util/hlc"
@@ -373,6 +374,9 @@ func verifyLogSizeInSync(t *testing.T, r *Replica) {
 func TestUpdateRaftStatusActivity(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 
+	settings := cluster.MakeTestingClusterSettings()
+	recentActiveThreshold := TimeUntilStoreDead.Get(&settings.SV)
+
 	type testCase struct {
 		prs      []raft.Progress
 		replicas []roachpb.ReplicaDescriptor
@@ -403,8 +407,8 @@ func TestUpdateRaftStatusActivity(t *testing.T) {
 			replicas: []roachpb.ReplicaDescriptor{{ReplicaID: 1}, {ReplicaID: 2}, {ReplicaID: 3}},
 			prs:      []raft.Progress{{RecentActive: false}, {RecentActive: true}},
 			lastUpdate: map[roachpb.ReplicaID]time.Time{
-				1: now.Add(-1 * MaxQuotaReplicaLivenessDuration / 2),
-				2: now.Add(-1 - MaxQuotaReplicaLivenessDuration),
+				1: now.Add(-1 * recentActiveThreshold / 2),
+				2: now.Add(-1 - recentActiveThreshold),
 				3: now,
 			},
 			now: now,
@@ -425,7 +429,7 @@ func TestUpdateRaftStatusActivity(t *testing.T) {
 			for i, pr := range tc.exp {
 				expPRs[uint64(i+1)] = pr
 			}
-			updateRaftProgressFromActivity(ctx, prs, tc.replicas, tc.lastUpdate, tc.now)
+			updateRaftProgressFromActivity(ctx, settings, prs, tc.replicas, tc.lastUpdate, tc.now)
 			assert.Equal(t, expPRs, prs)
 		})
 	}
diff --git a/pkg/storage/replica_proposal_quota.go b/pkg/storage/replica_proposal_quota.go
index aeb94a1dde77..ff5d9c54dc0b 100644
--- a/pkg/storage/replica_proposal_quota.go
+++ b/pkg/storage/replica_proposal_quota.go
@@ -160,7 +160,7 @@ func (r *Replica) updateProposalQuotaRaftMuLocked(
 		}
 
 		// Only consider followers that are active.
-		if !r.mu.lastUpdateTimes.isFollowerActive(ctx, rep.ReplicaID, now) {
+		if !r.mu.lastUpdateTimes.isFollowerActive(ctx, rep.ReplicaID, now, MaxQuotaReplicaLivenessDuration) {
 			return
 		}
 
diff --git a/pkg/storage/replica_raft.go b/pkg/storage/replica_raft.go
index eee4b4ff0aa5..89d31a34cd13 100644
--- a/pkg/storage/replica_raft.go
+++ b/pkg/storage/replica_raft.go
@@ -1717,7 +1717,7 @@ func (m lastUpdateTimesMap) updateOnBecomeLeader(descs []roachpb.ReplicaDescript
 // isFollowerActive returns whether the specified follower has made
 // communication with the leader in the last MaxQuotaReplicaLivenessDuration.
 func (m lastUpdateTimesMap) isFollowerActive(
-	ctx context.Context, replicaID roachpb.ReplicaID, now time.Time,
+	ctx context.Context, replicaID roachpb.ReplicaID, now time.Time, threshold time.Duration,
 ) bool {
 	lastUpdateTime, ok := m[replicaID]
 	if !ok {
@@ -1726,7 +1726,7 @@ func (m lastUpdateTimesMap) isFollowerActive(
 		// replicas were updated).
 		return false
 	}
-	return now.Sub(lastUpdateTime) <= MaxQuotaReplicaLivenessDuration
+	return now.Sub(lastUpdateTime) <= threshold
 }
 
 // processRaftCommand handles the complexities involved in moving the Raft
diff --git a/pkg/storage/split_delay_helper.go b/pkg/storage/split_delay_helper.go
index 2a451d232f62..f8bfd6b1794b 100644
--- a/pkg/storage/split_delay_helper.go
+++ b/pkg/storage/split_delay_helper.go
@@ -35,7 +35,7 @@ func (sdh *splitDelayHelper) RaftStatus(ctx context.Context) (roachpb.RangeID, *
 	raftStatus := r.raftStatusRLocked()
 	if raftStatus != nil {
 		updateRaftProgressFromActivity(
-			ctx, raftStatus.Progress, r.descRLocked().Replicas().Unwrap(), r.mu.lastUpdateTimes, timeutil.Now(),
+			ctx, r.ClusterSettings(), raftStatus.Progress, r.descRLocked().Replicas().Unwrap(), r.mu.lastUpdateTimes, timeutil.Now(),
 		)
 	}
 	r.mu.RUnlock()