diff --git a/pkg/storage/gc_queue.go b/pkg/storage/gc_queue.go
index 32146e25e4a0..ae85b38e5c0a 100644
--- a/pkg/storage/gc_queue.go
+++ b/pkg/storage/gc_queue.go
@@ -56,8 +56,34 @@ const (
 	// gcKeyVersionChunkBytes is the threshold size for splitting
 	// GCRequests into multiple batches.
 	gcKeyVersionChunkBytes = base.ChunkRaftCommandThresholdBytes
+
+	probablyLargeAbortSpanSysCountThreshold = 10000
+	probablyLargeAbortSpanSysBytesThreshold = 16 * (1 << 20) // 16mb
 )
 
+func probablyLargeAbortSpan(ms enginepb.MVCCStats) bool {
+	// If there is "a lot" of data in Sys{Bytes,Count}, then we are likely
+	// experiencing a large abort span. The abort span is not supposed to
+	// become that large, but it does happen and causes stability fallout,
+	// usually due to a combination of shortcomings:
+	//
+	// 1. there's no trigger for GC based on abort span size alone (before
+	//    this code block here was written)
+	// 2. transaction aborts tended to create unnecessary abort span entries,
+	//    fixed (and 19.2-backported) in:
+	//    https://github.com/cockroachdb/cockroach/pull/42765
+	// 3. aborting transactions in a busy loop:
+	//    https://github.com/cockroachdb/cockroach/issues/38088
+	//    (and we suspect this also happens in user apps occasionally)
+	// 4. large snapshots would never complete due to the queue time limits
+	//    (addressed in https://github.com/cockroachdb/cockroach/pull/44952).
+	//
+	// In an ideal world, we would factor in the abort span into this method
+	// directly, but until then the condition guarding this block will do.
+	return ms.SysCount >= probablyLargeAbortSpanSysCountThreshold &&
+		ms.SysBytes >= probablyLargeAbortSpanSysBytesThreshold
+}
+
 // gcQueue manages a queue of replicas slated to be scanned in their
 // entirety using the MVCC versions iterator. The gc queue manages the
 // following tasks:
@@ -165,12 +191,8 @@ func makeGCQueueScore(
 	// Use desc.RangeID for fuzzing the final score, so that different ranges
 	// have slightly different priorities and even symmetrical workloads don't
 	// trigger GC at the same time.
-	r := makeGCQueueScoreImpl(
-		ctx, int64(desc.RangeID), now, ms, zone.GC.TTLSeconds,
-	)
-	if (gcThreshold != hlc.Timestamp{}) {
-		r.LikelyLastGC = time.Duration(now.WallTime - gcThreshold.Add(r.TTL.Nanoseconds(), 0).WallTime)
-	}
+	r := makeGCQueueScoreImpl(ctx, int64(desc.RangeID), now, ms, zone.GC.TTLSeconds, gcThreshold)
+
 	return r
 }
 
@@ -263,10 +285,18 @@ func makeGCQueueScore(
 // ttl*GCBytes`, and that a decent trigger for GC is a multiple of
 // `ttl*GCBytes`.
 func makeGCQueueScoreImpl(
-	ctx context.Context, fuzzSeed int64, now hlc.Timestamp, ms enginepb.MVCCStats, ttlSeconds int32,
+	ctx context.Context,
+	fuzzSeed int64,
+	now hlc.Timestamp,
+	ms enginepb.MVCCStats,
+	ttlSeconds int32,
+	gcThreshold hlc.Timestamp,
 ) gcQueueScore {
 	ms.Forward(now.WallTime)
 	var r gcQueueScore
+	if (gcThreshold != hlc.Timestamp{}) {
+		r.LikelyLastGC = time.Duration(now.WallTime - gcThreshold.Add(r.TTL.Nanoseconds(), 0).WallTime)
+	}
 	r.TTL = time.Duration(ttlSeconds) * time.Second
 
 	// Treat a zero TTL as a one-second TTL, which avoids a priority of infinity
@@ -329,6 +359,11 @@ func makeGCQueueScoreImpl(
 	r.ShouldQueue = r.FuzzFactor*valScore > gcKeyScoreThreshold ||
 		r.FuzzFactor*r.IntentScore > gcIntentScoreThreshold
 	r.FinalScore = r.FuzzFactor * (valScore + r.IntentScore)
+	if probablyLargeAbortSpan(ms) && !r.ShouldQueue &&
+		(r.LikelyLastGC == 0 || r.LikelyLastGC > storagebase.TxnCleanupThreshold) {
+		r.ShouldQueue = true
+		r.FinalScore++
+	}
 	return r
 }
 
diff --git a/pkg/storage/gc_queue_test.go b/pkg/storage/gc_queue_test.go
index 6ae3c8256f56..729bd99c4f6b 100644
--- a/pkg/storage/gc_queue_test.go
+++ b/pkg/storage/gc_queue_test.go
@@ -36,6 +36,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/uuid"
 	"github.com/kr/pretty"
 	"github.com/pkg/errors"
+	"github.com/stretchr/testify/require"
 	"golang.org/x/sync/syncmap"
 )
 
@@ -103,7 +104,7 @@ func TestGCQueueMakeGCScoreInvariantQuick(t *testing.T) {
 			GCBytesAge: gcByteAge,
 		}
 		now := initialNow.Add(timePassed.Nanoseconds(), 0)
-		r := makeGCQueueScoreImpl(ctx, int64(seed), now, ms, ttlSec)
+		r := makeGCQueueScoreImpl(ctx, int64(seed), now, ms, ttlSec, hlc.Timestamp{})
 		wouldHaveToDeleteSomething := gcBytes*int64(ttlSec) < ms.GCByteAge(now.WallTime)
 		result := !r.ShouldQueue || wouldHaveToDeleteSomething
 		if !result {
@@ -124,13 +125,46 @@ func TestGCQueueMakeGCScoreAnomalousStats(t *testing.T) {
 			LiveBytes: int64(liveBytes),
 			ValBytes:  int64(valBytes),
 			KeyBytes:  int64(keyBytes),
-		}, 60)
+		}, 60, hlc.Timestamp{})
 		return r.DeadFraction >= 0 && r.DeadFraction <= 1
 	}, &quick.Config{MaxCount: 1000}); err != nil {
 		t.Fatal(err)
 	}
 }
 
+func TestGCQueueMakeGCScoreLargeAbortSpan(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	const seed = 1
+	var ms enginepb.MVCCStats
+	ms.SysCount += probablyLargeAbortSpanSysCountThreshold
+	ms.SysBytes += probablyLargeAbortSpanSysBytesThreshold
+
+	gcThresh := hlc.Timestamp{WallTime: 1}
+	expiration := storagebase.TxnCleanupThreshold.Nanoseconds() + 1
+	const ttlSec = 10000
+
+	// GC triggered if abort span should all be gc'able and it's likely large.
+	{
+		r := makeGCQueueScoreImpl(
+			context.Background(), seed,
+			hlc.Timestamp{WallTime: expiration + 1},
+			ms, ttlSec, gcThresh,
+		)
+		require.True(t, r.ShouldQueue)
+		require.NotZero(t, r.FinalScore)
+	}
+
+	// Heuristic doesn't fire if likely last GC within TxnCleanupThreshold.
+	{
+		r := makeGCQueueScoreImpl(context.Background(), seed,
+			hlc.Timestamp{WallTime: expiration},
+			ms, ttlSec, gcThresh,
+		)
+		require.False(t, r.ShouldQueue)
+		require.Zero(t, r.FinalScore)
+	}
+}
+
 const cacheFirstLen = 3
 
 type gcTestCacheKey struct {
@@ -244,7 +278,7 @@ func (cws *cachedWriteSimulator) shouldQueue(
 ) {
 	cws.t.Helper()
 	ts := hlc.Timestamp{}.Add(ms.LastUpdateNanos+after.Nanoseconds(), 0)
-	r := makeGCQueueScoreImpl(context.Background(), 0 /* seed */, ts, ms, int32(ttl.Seconds()))
+	r := makeGCQueueScoreImpl(context.Background(), 0 /* seed */, ts, ms, int32(ttl.Seconds()), hlc.Timestamp{})
 	if fmt.Sprintf("%.2f", r.FinalScore) != fmt.Sprintf("%.2f", prio) || b != r.ShouldQueue {
 		cws.t.Errorf("expected queued=%t (is %t), prio=%.2f, got %.2f: after=%s, ttl=%s:\nms: %+v\nscore: %s",
 			b, r.ShouldQueue, prio, r.FinalScore, after, ttl, ms, r)