From 2bd28c7b686a4edd3ca346c9e1f05751aaa551c9 Mon Sep 17 00:00:00 2001
From: Daniel Harrison
Date: Wed, 26 Jun 2019 11:30:06 -0700
Subject: [PATCH] storage: truncate raft log less aggressively when replica is missing

Previously, we'd hold off on truncating the raft log if a replica was
missing but had been contactable within the last 10 seconds. This meant
that if a node was down for *more* than 10 seconds, there was a good
chance that we'd truncate logs for some of its replicas (especially for
its high-traffic ones) and it would need snapshots for them when it came
back up.

This was for two reasons. First, we've historically assumed that it's
cheaper to catch a far-behind node up with a snapshot than with entries.
Second, snapshots historically had to include the Raft log, which implied
a need to keep the size of the Raft log tightly controlled because it was
pulled into memory at the snapshot receiver, but that's changed recently.

The problem is when a node is down for longer than 10 seconds but shorter
than the time it takes to upreplicate all of its ranges onto new nodes.
It might come back up to find that it needs a snapshot for most of its
ranges. We rate limit snapshots fairly aggressively because they've been
disruptive in the past, but this means that it could potentially take
hours for a node to recover from a 2-minute outage.

This would be merely unfortunate if there wasn't a second compounding
issue. A recently restarted node has a cold leaseholder cache. When it
gets traffic for one of its replicas, it first tries itself as the
leaseholder (maybe it will get lucky and won't need the network hop).
Unfortunately, if the replica needs a snapshot, this decision currently
blocks on it. This means that requests sent to the recently started node
could block for as long as the heavily throttled snapshots take: hours or
even days.

Short outages of more than 10 seconds are reasonably common with routine
maintenance (rolling to a new version, swapping hardware, etc.), so it's
likely that customers will hit this (and one did).

This commit avoids truncating the log past any follower's position when
all replicas have recently been active (the quota pool keeps it from
growing without bound in this case). If at least one replica hasn't
recently been active, it holds off any truncation until the log reaches a
size threshold.

Partial mitigation for #37906

Potentially also helps with #36879

Release note (bug fix): Nodes that have been down for less than
`server.time_until_store_dead` now recover more quickly when they rejoin.
---
 pkg/base/config.go                 |  6 +-
 pkg/cmd/roachtest/registry.go      |  1 +
 pkg/cmd/roachtest/restart.go       | 71 ++++++++++++++++++++++++
 pkg/storage/raft_log_queue.go      | 89 +++++++++++++++++-------------
 pkg/storage/raft_log_queue_test.go | 16 +++---
 5 files changed, 134 insertions(+), 49 deletions(-)
 create mode 100644 pkg/cmd/roachtest/restart.go

diff --git a/pkg/base/config.go b/pkg/base/config.go
index da9dd7a0f678..0dedf6e7c21f 100644
--- a/pkg/base/config.go
+++ b/pkg/base/config.go
@@ -97,8 +97,10 @@ var (
 		"COCKROACH_RAFT_ELECTION_TIMEOUT_TICKS", 15)
 
 	// defaultRaftLogTruncationThreshold specifies the upper bound that a single
-	// Range's Raft log can grow to before log truncations are triggered, even
-	// if that means a snapshot will be required for a straggling follower.
+	// Range's Raft log can grow to before log truncations are triggered while at
+	// least one follower is missing.
If all followers are active, the quota pool + // is responsible for ensuring the raft log doesn't grow without bound by + // making sure the leader doesn't get too far ahead. defaultRaftLogTruncationThreshold = envutil.EnvOrDefaultInt64( "COCKROACH_RAFT_LOG_TRUNCATION_THRESHOLD", 4<<20 /* 4 MB */) diff --git a/pkg/cmd/roachtest/registry.go b/pkg/cmd/roachtest/registry.go index 72ab56d76ec8..72a3f1b4d2ff 100644 --- a/pkg/cmd/roachtest/registry.go +++ b/pkg/cmd/roachtest/registry.go @@ -52,6 +52,7 @@ func registerTests(r *registry) { registerQueue(r) registerRebalanceLoad(r) registerReplicaGC(r) + registerRestart(r) registerRestore(r) registerRoachmart(r) registerScaleData(r) diff --git a/pkg/cmd/roachtest/restart.go b/pkg/cmd/roachtest/restart.go new file mode 100644 index 000000000000..21b261e9ad52 --- /dev/null +++ b/pkg/cmd/roachtest/restart.go @@ -0,0 +1,71 @@ +// Copyright 2019 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package main + +import ( + "context" + "fmt" + "time" + + "github.com/cockroachdb/cockroach/pkg/util/timeutil" +) + +func runRestart(ctx context.Context, t *test, c *cluster, downDuration time.Duration) { + crdbNodes := c.Range(1, c.nodes) + workloadNode := c.Node(1) + const restartNode = 3 + + t.Status("installing cockroach") + c.Put(ctx, cockroach, "./cockroach", crdbNodes) + c.Start(ctx, t, crdbNodes, startArgs(`--args=--vmodule=raft_log_queue=3`)) + + // We don't really need tpcc, we just need a good amount of traffic and a good + // amount of data. + t.Status("importing tpcc fixture") + c.Run(ctx, workloadNode, + "./cockroach workload fixtures import tpcc --warehouses=100 --fks=false --checks=false") + + // Stop a node. + c.Stop(ctx, c.Node(restartNode)) + + // Wait for between 10s and `server.time_until_store_dead` while sending + // traffic to one of the nodes that are not down. This used to cause lots of + // raft log truncation, which caused node 3 to need lots of snapshots when it + // came back up. + c.Run(ctx, workloadNode, "./cockroach workload run tpcc --warehouses=100 "+ + fmt.Sprintf("--tolerate-errors --wait=false --duration=%s", downDuration)) + + // Bring it back up and make sure it can serve a query within a reasonable + // time limit. For now, less time than it was down for. 
+	c.Start(ctx, t, c.Node(restartNode))
+	start := timeutil.Now()
+	restartNodeDB := c.Conn(ctx, restartNode)
+	if _, err := restartNodeDB.Exec(`SELECT count(*) FROM tpcc.order_line`); err != nil {
+		t.Fatal(err)
+	}
+	if took := timeutil.Since(start); took > downDuration {
+		t.Fatalf(`expected to recover within %s, took %s`, downDuration, took)
+	} else {
+		c.l.Printf(`connection and query finished in %s`, took)
+	}
+}
+
+func registerRestart(r *registry) {
+	r.Add(testSpec{
+		Name:    "restart/down-for-2m",
+		Cluster: makeClusterSpec(3),
+		// "cockroach workload is only in 19.1+"
+		MinVersion: "v19.1.0",
+		Run: func(ctx context.Context, t *test, c *cluster) {
+			runRestart(ctx, t, c, 2*time.Minute)
+		},
+	})
+}
diff --git a/pkg/storage/raft_log_queue.go b/pkg/storage/raft_log_queue.go
index 3306ee7380de..e076d1e62915 100644
--- a/pkg/storage/raft_log_queue.go
+++ b/pkg/storage/raft_log_queue.go
@@ -355,6 +355,16 @@ func (td *truncateDecision) ShouldTruncate() bool {
 		(n > 0 && td.Input.LogSize >= RaftLogQueueStaleSize)
 }
 
+// EnsureNotPastIndex lowers the truncation point to the given index if it would
+// be truncating at a point past it. If a change is made, the ChosenVia is
+// updated with the given one.
+func (td *truncateDecision) EnsureNotPastIndex(index uint64, chosenVia string) {
+	if td.NewFirstIndex > index {
+		td.NewFirstIndex = index
+		td.ChosenVia = chosenVia
+	}
+}
+
 // computeTruncateDecision returns the oldest index that cannot be
 // truncated. If there is a behind node, we want to keep old raft logs so it
 // can catch up without having to send a full snapshot. However, if a node down
@@ -380,50 +390,54 @@ func computeTruncateDecision(input truncateDecisionInput) truncateDecision {
 	decision.ChosenVia = truncatableIndexChosenViaQuorumIndex
 
 	for _, progress := range input.RaftStatus.Progress {
-		if !progress.RecentActive {
-			// If a follower isn't recently active, don't lower the truncation
-			// index for it as the follower is likely not online at all and would
-			// block log truncation forever.
-			continue
-		}
+		// Snapshots are expensive, so we try our best to avoid truncating past
+		// where a follower is.
 
-		// Generally we truncate to the quorum commit index when the log becomes
-		// too large, but we make an exception for live followers which are
-		// being probed (i.e. the leader doesn't know how far they've caught
-		// up). In that case the Match index is too large, and so the quorum
-		// index can be, too. We don't want these followers to require a
-		// snapshot since they are most likely going to be caught up very soon
-		// (they respond with the "right index" to the first probe or don't
-		// respond, in which case they should end up as not recently active).
-		// But we also don't know their index, so we can't possible make a
-		// truncation decision that avoids that at this point and make the
-		// truncation a no-op.
+		// First, we never truncate off a recently active follower, no matter how
+		// large the log gets. Recently active shares its (currently 10s) constant
+		// with the quota pool, so the quota pool should put a bound on how much
+		// the raft log can grow due to this.
+		//
+		// For live followers which are being probed (i.e. the leader doesn't know
+		// how far they've caught up), the Match index is too large, and so the
+		// quorum index can be, too. We don't want these followers to require a
+		// snapshot since they are most likely going to be caught up very soon (they
+		// respond with the "right index" to the first probe or don't respond, in
+		// which case they should end up as not recently active). But we also don't
+		// know their index, so we can't possibly make a truncation decision that
+		// avoids that at this point; instead, we make the truncation a no-op.
 		//
-		// The scenario in which this is most relevant is during restores, where
-		// we split off new ranges that rapidly receive very large log entries
-		// while the Raft group is still in a state of discovery (a new leader
-		// starts probing followers at its own last index). Additionally, these
-		// ranges will be split many times over, resulting in a flurry of
-		// snapshots with overlapping bounds that put significant stress on the
-		// Raft snapshot queue.
-		if progress.State == raft.ProgressStateProbe {
-			if decision.NewFirstIndex > decision.Input.FirstIndex {
-				decision.NewFirstIndex = decision.Input.FirstIndex
-				decision.ChosenVia = truncatableIndexChosenViaProbingFollower
+		// The scenario in which this is most relevant is during restores, where we
+		// split off new ranges that rapidly receive very large log entries while
+		// the Raft group is still in a state of discovery (a new leader starts
+		// probing followers at its own last index). Additionally, these ranges will
+		// be split many times over, resulting in a flurry of snapshots with
+		// overlapping bounds that put significant stress on the Raft snapshot
+		// queue.
+		if progress.RecentActive {
+			if progress.State == raft.ProgressStateProbe {
+				decision.EnsureNotPastIndex(decision.Input.FirstIndex, truncatableIndexChosenViaProbingFollower)
+			} else {
+				decision.EnsureNotPastIndex(progress.Match, truncatableIndexChosenViaFollowers)
 			}
-		} else if !input.LogTooLarge() && decision.NewFirstIndex > progress.Match {
-			decision.NewFirstIndex = progress.Match
-			decision.ChosenVia = truncatableIndexChosenViaFollowers
+			continue
+		}
+
+		// Second, if the follower has not been recently active, we don't
+		// truncate it off as long as the raft log is not too large.
+		if !input.LogTooLarge() {
+			decision.EnsureNotPastIndex(progress.Match, truncatableIndexChosenViaFollowers)
 		}
+
+		// Otherwise, we let it truncate to the quorum index.
 	}
 
 	// The pending snapshot index acts as a placeholder for a replica that is
 	// about to be added to the range (or is in Raft recovery). We don't want to
 	// truncate the log in a way that will require that new replica to be caught
 	// up via yet another Raft snapshot.
-	if input.PendingPreemptiveSnapshotIndex > 0 && decision.NewFirstIndex > input.PendingPreemptiveSnapshotIndex {
-		decision.NewFirstIndex = input.PendingPreemptiveSnapshotIndex
-		decision.ChosenVia = truncatableIndexChosenViaPendingSnap
+	if input.PendingPreemptiveSnapshotIndex > 0 {
+		decision.EnsureNotPastIndex(input.PendingPreemptiveSnapshotIndex, truncatableIndexChosenViaPendingSnap)
 	}
 
 	// Advance to the first index, but never truncate past the quorum commit
@@ -437,10 +451,7 @@ func computeTruncateDecision(input truncateDecisionInput) truncateDecision {
 	// updated on the leader when a command is proposed and in a single replica
 	// Raft group this also means that RaftStatus.Commit is updated at propose
 	// time.
- if decision.NewFirstIndex > input.LastIndex { - decision.NewFirstIndex = input.LastIndex - decision.ChosenVia = truncatableIndexChosenViaLastIndex - } + decision.EnsureNotPastIndex(input.LastIndex, truncatableIndexChosenViaLastIndex) // If new first index dropped below first index, make them equal (resulting // in a no-op). @@ -493,7 +504,7 @@ func (rlq *raftLogQueue) shouldQueue( } // shouldQueueImpl returns whether the given truncate decision should lead to -// a log truncation. This is either the case if the decision says so or of +// a log truncation. This is either the case if the decision says so or if // we want to recompute the log size (in which case `recomputeRaftLogSize` and // `shouldQ` are both true and a reasonable priority is returned). func (rlq *raftLogQueue) shouldQueueImpl( diff --git a/pkg/storage/raft_log_queue_test.go b/pkg/storage/raft_log_queue_test.go index f0f220dde6e5..2fe798b5cf83 100644 --- a/pkg/storage/raft_log_queue_test.go +++ b/pkg/storage/raft_log_queue_test.go @@ -170,10 +170,10 @@ func TestComputeTruncateDecision(t *testing.T) { []uint64{1, 2, 3, 4}, 100, 2, 2, 0, "should truncate: false [truncate 0 entries to first index 2 (chosen via: first index)]", }, - // If over targetSize, should truncate to quorum committed index. Minority will need snapshots. + // Don't truncate off active followers, even if over targetSize. { []uint64{1, 3, 3, 4}, 2000, 1, 3, 0, - "should truncate: false [truncate 2 entries to first index 3 (chosen via: quorum); log too large (2.0 KiB > 1000 B); implies 1 Raft snapshot]", + "should truncate: false [truncate 0 entries to first index 1 (chosen via: followers); log too large (2.0 KiB > 1000 B)]", }, // Don't truncate away pending snapshot, even when log too large. { @@ -182,11 +182,11 @@ func TestComputeTruncateDecision(t *testing.T) { }, { []uint64{1, 3, 3, 4}, 2000, 2, 3, 0, - "should truncate: false [truncate 1 entries to first index 3 (chosen via: quorum); log too large (2.0 KiB > 1000 B)]", + "should truncate: false [truncate 0 entries to first index 2 (chosen via: first index); log too large (2.0 KiB > 1000 B)]", }, { []uint64{1, 3, 3, 4}, 2000, 3, 3, 0, - "should truncate: false [truncate 0 entries to first index 3 (chosen via: quorum); log too large (2.0 KiB > 1000 B)]", + "should truncate: false [truncate 0 entries to first index 3 (chosen via: first index); log too large (2.0 KiB > 1000 B)]", }, // The pending snapshot index affects the quorum commit index. { @@ -209,7 +209,7 @@ func TestComputeTruncateDecision(t *testing.T) { "should truncate: false [truncate 0 entries to first index 2 (chosen via: first index)]", }} for i, c := range testCases { - t.Run(fmt.Sprintf("%+v", c), func(t *testing.T) { + t.Run("", func(t *testing.T) { status := raft.Status{ Progress: make(map[uint64]raft.Progress), } @@ -227,7 +227,7 @@ func TestComputeTruncateDecision(t *testing.T) { } decision := computeTruncateDecision(input) if act, exp := decision.String(), c.exp; act != exp { - t.Errorf("%d: got:\n%s\nwanted:\n%s", i, act, exp) + t.Errorf("%d:\ngot:\n%s\nwanted:\n%s", i, act, exp) } // Verify the triggers that queue a range for recomputation. 
In @@ -266,11 +266,11 @@ func TestComputeTruncateDecisionProgressStatusProbe(t *testing.T) { exp := map[bool]map[bool]string{ // (tooLarge, active) false: { true: "should truncate: false [truncate 0 entries to first index 10 (chosen via: probing follower)]", - false: "should truncate: true [truncate 190 entries to first index 200 (chosen via: followers)]", + false: "should truncate: false [truncate 0 entries to first index 10 (chosen via: first index)]", }, true: { true: "should truncate: false [truncate 0 entries to first index 10 (chosen via: probing follower); log too large (2.0 KiB > 1.0 KiB)]", - false: "should truncate: true [truncate 290 entries to first index 300 (chosen via: quorum); log too large (2.0 KiB > 1.0 KiB); implies 1 Raft snapshot]", + false: "should truncate: true [truncate 190 entries to first index 200 (chosen via: followers); log too large (2.0 KiB > 1.0 KiB)]", }, }
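
For illustration only (not part of the patch): a minimal, self-contained Go sketch
of the truncation policy described in the commit message. The names below
(followerProgress, truncateInput, chooseNewFirstIndex) are simplified, hypothetical
stand-ins for raft.Progress and truncateDecisionInput, not the real API.

// Sketch of the policy: never truncate past a recently active follower (the
// quota pool bounds log growth in that case); only truncate past an inactive
// follower once the log exceeds the size threshold.
package main

import "fmt"

type followerProgress struct {
	Match        uint64 // highest log index known to be replicated to the follower
	RecentActive bool   // follower responded within the recent-activity window (~10s)
	Probing      bool   // leader doesn't yet know how far the follower has caught up
}

type truncateInput struct {
	QuorumIndex uint64 // index committed by a quorum
	FirstIndex  uint64 // current first index of the local raft log
	LogSize     int64  // current raft log size in bytes
	MaxLogSize  int64  // truncation threshold, e.g. 4 MB
	Followers   []followerProgress
}

func chooseNewFirstIndex(in truncateInput) uint64 {
	newFirstIndex := in.QuorumIndex
	ensureNotPast := func(idx uint64) {
		if newFirstIndex > idx {
			newFirstIndex = idx
		}
	}
	for _, p := range in.Followers {
		if p.RecentActive {
			if p.Probing {
				// Position unknown; make the truncation a no-op.
				ensureNotPast(in.FirstIndex)
			} else {
				ensureNotPast(p.Match)
			}
			continue
		}
		// Inactive follower: keep its entries only while the log is small enough.
		if in.LogSize <= in.MaxLogSize {
			ensureNotPast(p.Match)
		}
	}
	if newFirstIndex < in.FirstIndex {
		newFirstIndex = in.FirstIndex
	}
	return newFirstIndex
}

func main() {
	in := truncateInput{
		QuorumIndex: 100,
		FirstIndex:  10,
		LogSize:     1 << 20, // 1 MB: under the threshold
		MaxLogSize:  4 << 20,
		Followers: []followerProgress{
			{Match: 100, RecentActive: true},
			{Match: 40, RecentActive: false}, // briefly-down node: its entries are kept
		},
	}
	fmt.Println(chooseNewFirstIndex(in)) // prints 40
}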