From 414d4644d71e95e1fa0a9f22a65361d16f380b35 Mon Sep 17 00:00:00 2001
From: Tobias Schottdorf
Date: Thu, 5 Mar 2020 12:42:38 +0100
Subject: [PATCH] storage: trigger GC based on SysCount/SysBytes

If there is "a lot" of data in Sys{Bytes,Count}, then we are likely
experiencing a large abort span. The abort span is not supposed to
become that large, but it does happen and causes stability fallout,
usually due to a combination of shortcomings:

1. there's no trigger for GC based on abort span size alone (before
   this commit)
2. transaction aborts tended to create unnecessary abort span entries,
   fixed (and 19.2-backported) in:
   https://github.com/cockroachdb/cockroach/pull/42765
3. aborting transactions in a busy loop:
   https://github.com/cockroachdb/cockroach/issues/38088
   (and we suspect this also happens in user apps occasionally)
4. large snapshots would never complete due to the queue time limits
   (addressed in https://github.com/cockroachdb/cockroach/pull/44952).

In an ideal world, we would factor the abort span into this method
directly, but until then the condition guarding this block will do. At
worst, there is some other reason for SysBytes to become that large
while also incurring a large SysCount, but I'm not sure how this would
happen. The only other things in this span are the versioned range
descriptors (which follow regular GC and only ever add a SysCount of
one) and transaction records (which expire like abort span records).

Release note (bug fix): Range garbage collection will now trigger based
on a large abort span, adding defense-in-depth against ranges growing
large (and eventually unstable).
---
 pkg/storage/gc_queue.go      | 49 ++++++++++++++++++++++++++++++------
 pkg/storage/gc_queue_test.go | 40 ++++++++++++++++++++++++++---
 2 files changed, 79 insertions(+), 10 deletions(-)

diff --git a/pkg/storage/gc_queue.go b/pkg/storage/gc_queue.go
index 32146e25e4a0..ae85b38e5c0a 100644
--- a/pkg/storage/gc_queue.go
+++ b/pkg/storage/gc_queue.go
@@ -56,8 +56,34 @@ const (
 	// gcKeyVersionChunkBytes is the threshold size for splitting
 	// GCRequests into multiple batches.
 	gcKeyVersionChunkBytes = base.ChunkRaftCommandThresholdBytes
+
+	probablyLargeAbortSpanSysCountThreshold = 10000
+	probablyLargeAbortSpanSysBytesThreshold = 16 * (1 << 20) // 16mb
 )
 
+func probablyLargeAbortSpan(ms enginepb.MVCCStats) bool {
+	// If there is "a lot" of data in Sys{Bytes,Count}, then we are likely
+	// experiencing a large abort span. The abort span is not supposed to
+	// become that large, but it does happen and causes stability fallout,
+	// usually due to a combination of shortcomings:
+	//
+	// 1. there's no trigger for GC based on abort span size alone (before
+	//    this code block was written)
+	// 2. transaction aborts tended to create unnecessary abort span entries,
+	//    fixed (and 19.2-backported) in:
+	//    https://github.com/cockroachdb/cockroach/pull/42765
+	// 3. aborting transactions in a busy loop:
+	//    https://github.com/cockroachdb/cockroach/issues/38088
+	//    (and we suspect this also happens in user apps occasionally)
+	// 4. large snapshots would never complete due to the queue time limits
+	//    (addressed in https://github.com/cockroachdb/cockroach/pull/44952).
+	//
+	// In an ideal world, we would factor the abort span into this method
+	// directly, but until then the condition guarding this block will do.
+	return ms.SysCount >= probablyLargeAbortSpanSysCountThreshold &&
+		ms.SysBytes >= probablyLargeAbortSpanSysBytesThreshold
+}
+
 // gcQueue manages a queue of replicas slated to be scanned in their
 // entirety using the MVCC versions iterator. The gc queue manages the
 // following tasks:
@@ -165,12 +191,8 @@ func makeGCQueueScore(
 	// Use desc.RangeID for fuzzing the final score, so that different ranges
 	// have slightly different priorities and even symmetrical workloads don't
 	// trigger GC at the same time.
-	r := makeGCQueueScoreImpl(
-		ctx, int64(desc.RangeID), now, ms, zone.GC.TTLSeconds,
-	)
-	if (gcThreshold != hlc.Timestamp{}) {
-		r.LikelyLastGC = time.Duration(now.WallTime - gcThreshold.Add(r.TTL.Nanoseconds(), 0).WallTime)
-	}
+	r := makeGCQueueScoreImpl(ctx, int64(desc.RangeID), now, ms, zone.GC.TTLSeconds, gcThreshold)
+
 	return r
 }
 
@@ -263,10 +285,18 @@ func makeGCQueueScore(
 // ttl*GCBytes`, and that a decent trigger for GC is a multiple of
 // `ttl*GCBytes`.
 func makeGCQueueScoreImpl(
-	ctx context.Context, fuzzSeed int64, now hlc.Timestamp, ms enginepb.MVCCStats, ttlSeconds int32,
+	ctx context.Context,
+	fuzzSeed int64,
+	now hlc.Timestamp,
+	ms enginepb.MVCCStats,
+	ttlSeconds int32,
+	gcThreshold hlc.Timestamp,
 ) gcQueueScore {
 	ms.Forward(now.WallTime)
 	var r gcQueueScore
+	if (gcThreshold != hlc.Timestamp{}) {
+		r.LikelyLastGC = time.Duration(now.WallTime - gcThreshold.Add(r.TTL.Nanoseconds(), 0).WallTime)
+	}
 	r.TTL = time.Duration(ttlSeconds) * time.Second
 
 	// Treat a zero TTL as a one-second TTL, which avoids a priority of infinity
@@ -329,6 +359,11 @@ func makeGCQueueScoreImpl(
 
 	r.ShouldQueue = r.FuzzFactor*valScore > gcKeyScoreThreshold || r.FuzzFactor*r.IntentScore > gcIntentScoreThreshold
 	r.FinalScore = r.FuzzFactor * (valScore + r.IntentScore)
+	if probablyLargeAbortSpan(ms) && !r.ShouldQueue &&
+		(r.LikelyLastGC == 0 || r.LikelyLastGC > storagebase.TxnCleanupThreshold) {
+		r.ShouldQueue = true
+		r.FinalScore++
+	}
 
 	return r
 }
diff --git a/pkg/storage/gc_queue_test.go b/pkg/storage/gc_queue_test.go
index 6ae3c8256f56..729bd99c4f6b 100644
--- a/pkg/storage/gc_queue_test.go
+++ b/pkg/storage/gc_queue_test.go
@@ -36,6 +36,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/uuid"
 	"github.com/kr/pretty"
 	"github.com/pkg/errors"
+	"github.com/stretchr/testify/require"
 	"golang.org/x/sync/syncmap"
 )
 
@@ -103,7 +104,7 @@ func TestGCQueueMakeGCScoreInvariantQuick(t *testing.T) {
 			GCBytesAge: gcByteAge,
 		}
 		now := initialNow.Add(timePassed.Nanoseconds(), 0)
-		r := makeGCQueueScoreImpl(ctx, int64(seed), now, ms, ttlSec)
+		r := makeGCQueueScoreImpl(ctx, int64(seed), now, ms, ttlSec, hlc.Timestamp{})
 		wouldHaveToDeleteSomething := gcBytes*int64(ttlSec) < ms.GCByteAge(now.WallTime)
 		result := !r.ShouldQueue || wouldHaveToDeleteSomething
 		if !result {
@@ -124,13 +125,46 @@ func TestGCQueueMakeGCScoreAnomalousStats(t *testing.T) {
 			LiveBytes: int64(liveBytes),
 			ValBytes:  int64(valBytes),
 			KeyBytes:  int64(keyBytes),
-		}, 60)
+		}, 60, hlc.Timestamp{})
 		return r.DeadFraction >= 0 && r.DeadFraction <= 1
 	}, &quick.Config{MaxCount: 1000}); err != nil {
 		t.Fatal(err)
 	}
 }
 
+func TestGCQueueMakeGCScoreLargeAbortSpan(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	const seed = 1
+	var ms enginepb.MVCCStats
+	ms.SysCount += probablyLargeAbortSpanSysCountThreshold
+	ms.SysBytes += probablyLargeAbortSpanSysBytesThreshold
+
+	gcThresh := hlc.Timestamp{WallTime: 1}
+	expiration := storagebase.TxnCleanupThreshold.Nanoseconds() + 1
+	const ttlSec = 10000
+
+	// GC triggered if abort span should all be gc'able and it's likely large.
+	{
+		r := makeGCQueueScoreImpl(
+			context.Background(), seed,
+			hlc.Timestamp{WallTime: expiration + 1},
+			ms, ttlSec, gcThresh,
+		)
+		require.True(t, r.ShouldQueue)
+		require.NotZero(t, r.FinalScore)
+	}
+
+	// Heuristic doesn't fire if likely last GC within TxnCleanupThreshold.
+	{
+		r := makeGCQueueScoreImpl(context.Background(), seed,
+			hlc.Timestamp{WallTime: expiration},
+			ms, ttlSec, gcThresh,
+		)
+		require.False(t, r.ShouldQueue)
+		require.Zero(t, r.FinalScore)
+	}
+}
+
 const cacheFirstLen = 3
 
 type gcTestCacheKey struct {
@@ -244,7 +278,7 @@ func (cws *cachedWriteSimulator) shouldQueue(
 ) {
 	cws.t.Helper()
 	ts := hlc.Timestamp{}.Add(ms.LastUpdateNanos+after.Nanoseconds(), 0)
-	r := makeGCQueueScoreImpl(context.Background(), 0 /* seed */, ts, ms, int32(ttl.Seconds()))
+	r := makeGCQueueScoreImpl(context.Background(), 0 /* seed */, ts, ms, int32(ttl.Seconds()), hlc.Timestamp{})
 	if fmt.Sprintf("%.2f", r.FinalScore) != fmt.Sprintf("%.2f", prio) || b != r.ShouldQueue {
 		cws.t.Errorf("expected queued=%t (is %t), prio=%.2f, got %.2f: after=%s, ttl=%s:\nms: %+v\nscore: %s",
 			b, r.ShouldQueue, prio, r.FinalScore, after, ttl, ms, r)