storage: trigger GC based on SysCount/SysBytes
If there is "a lot" of data in Sys{Bytes,Count}, then we are likely
experiencing a large abort span. The abort span is not supposed to
become that large, but it does happen and causes stability fallout,
usually due to a combination of shortcomings:

1. there's no trigger for GC based on abort span size alone (before
   this commit)
2. transaction aborts tended to create unnecessary abort span entries,
   fixed (and 19.2-backported) in:
   cockroachdb#42765
3. aborting transactions in a busy loop:
   cockroachdb#38088
   (and we suspect this also happens in user apps occasionally)
4. large snapshots would never complete due to the queue time limits
   (addressed in cockroachdb#44952).

In an ideal world, we would factor the abort span into this method
directly, but until then the condition guarding this block will do.
At worst, there is some other reason for SysBytes to become that large
while also incurring a large SysCount, but I'm not sure how this
would happen. The only other things in this span are the versioned
range descriptors (which follow regular GC and only ever add a
SysCount of one) and transaction records (which expire like abort
span records).

Release note (bug fix): Range garbage collection will now trigger
based on a large abort span, adding defense-in-depth against ranges
growing large (and eventually unstable).
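
As a rough illustration of the heuristic this commit adds, here is a minimal, self-contained Go sketch: the two thresholds mirror the constants in the diff below, while the simplified signature (raw counts instead of enginepb.MVCCStats) and the sample numbers are purely illustrative.

```go
package main

import "fmt"

// Thresholds mirroring the commit: both must be exceeded before the GC
// queue treats the range as probably carrying a large abort span.
const (
	sysCountThreshold = 10000          // number of system keys
	sysBytesThreshold = 16 * (1 << 20) // 16 MiB of system data
)

// probablyLargeAbortSpan reports whether the system-key stats look like a
// bloated abort span. Simplified stand-in for the method in the diff,
// which takes enginepb.MVCCStats.
func probablyLargeAbortSpan(sysCount, sysBytes int64) bool {
	return sysCount >= sysCountThreshold && sysBytes >= sysBytesThreshold
}

func main() {
	// ~50k abort span entries averaging ~1 KiB trip both thresholds;
	// the same count of tiny entries stays under the byte threshold.
	fmt.Println(probablyLargeAbortSpan(50000, 50000*1024)) // true
	fmt.Println(probablyLargeAbortSpan(50000, 1<<20))      // false
}
```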
tbg committed Mar 5, 2020
1 parent 45ae737 commit 414d464
Showing 2 changed files with 79 additions and 10 deletions.
49 changes: 42 additions & 7 deletions pkg/storage/gc_queue.go
@@ -56,8 +56,34 @@ const (
// gcKeyVersionChunkBytes is the threshold size for splitting
// GCRequests into multiple batches.
gcKeyVersionChunkBytes = base.ChunkRaftCommandThresholdBytes

probablyLargeAbortSpanSysCountThreshold = 10000
probablyLargeAbortSpanSysBytesThreshold = 16 * (1 << 20) // 16mb
)

func probablyLargeAbortSpan(ms enginepb.MVCCStats) bool {
// If there is "a lot" of data in Sys{Bytes,Count}, then we are likely
// experiencing a large abort span. The abort span is not supposed to
// become that large, but it does happen and causes stability fallout,
// usually due to a combination of shortcomings:
//
// 1. there's no trigger for GC based on abort span size alone (before
// this code block here was written)
// 2. transaction aborts tended to create unnecessary abort span entries,
// fixed (and 19.2-backported) in:
// https://github.com/cockroachdb/cockroach/pull/42765
// 3. aborting transactions in a busy loop:
// https://github.com/cockroachdb/cockroach/issues/38088
// (and we suspect this also happens in user apps occasionally)
// 4. large snapshots would never complete due to the queue time limits
// (addressed in https://github.com/cockroachdb/cockroach/pull/44952).
//
// In an ideal world, we would factor in the abort span into this method
// directly, but until then the condition guarding this block will do.
return ms.SysCount >= probablyLargeAbortSpanSysCountThreshold &&
ms.SysBytes >= probablyLargeAbortSpanSysBytesThreshold
}

// gcQueue manages a queue of replicas slated to be scanned in their
// entirety using the MVCC versions iterator. The gc queue manages the
// following tasks:
@@ -165,12 +191,8 @@ func makeGCQueueScore(
// Use desc.RangeID for fuzzing the final score, so that different ranges
// have slightly different priorities and even symmetrical workloads don't
// trigger GC at the same time.
r := makeGCQueueScoreImpl(
ctx, int64(desc.RangeID), now, ms, zone.GC.TTLSeconds,
)
if (gcThreshold != hlc.Timestamp{}) {
r.LikelyLastGC = time.Duration(now.WallTime - gcThreshold.Add(r.TTL.Nanoseconds(), 0).WallTime)
}
r := makeGCQueueScoreImpl(ctx, int64(desc.RangeID), now, ms, zone.GC.TTLSeconds, gcThreshold)

return r
}

@@ -263,10 +285,18 @@ func makeGCQueueScore(
// ttl*GCBytes`, and that a decent trigger for GC is a multiple of
// `ttl*GCBytes`.
func makeGCQueueScoreImpl(
ctx context.Context, fuzzSeed int64, now hlc.Timestamp, ms enginepb.MVCCStats, ttlSeconds int32,
ctx context.Context,
fuzzSeed int64,
now hlc.Timestamp,
ms enginepb.MVCCStats,
ttlSeconds int32,
gcThreshold hlc.Timestamp,
) gcQueueScore {
ms.Forward(now.WallTime)
var r gcQueueScore
if (gcThreshold != hlc.Timestamp{}) {
r.LikelyLastGC = time.Duration(now.WallTime - gcThreshold.Add(r.TTL.Nanoseconds(), 0).WallTime)
}
r.TTL = time.Duration(ttlSeconds) * time.Second

// Treat a zero TTL as a one-second TTL, which avoids a priority of infinity
@@ -329,6 +359,11 @@ func makeGCQueueScoreImpl(
r.ShouldQueue = r.FuzzFactor*valScore > gcKeyScoreThreshold || r.FuzzFactor*r.IntentScore > gcIntentScoreThreshold
r.FinalScore = r.FuzzFactor * (valScore + r.IntentScore)

if probablyLargeAbortSpan(ms) && !r.ShouldQueue &&
(r.LikelyLastGC == 0 || r.LikelyLastGC > storagebase.TxnCleanupThreshold) {
r.ShouldQueue = true
r.FinalScore++
}
return r
}

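
For context on the guard added at the end of makeGCQueueScoreImpl above, a hedged sketch of the decision it encodes: the heuristic only queues the range when the abort span looks large, nothing else has already queued it, and the likely last GC is either unknown or old enough that the entries have expired. The one-hour constant below is a stand-in for storagebase.TxnCleanupThreshold; the function name and standalone structure are illustrative only.

```go
package main

import (
	"fmt"
	"time"
)

// txnCleanupThreshold stands in for storagebase.TxnCleanupThreshold: abort
// span entries older than this are eligible for removal, so GC'ing again
// sooner than that would not shrink the span.
const txnCleanupThreshold = time.Hour

// shouldQueueForAbortSpan mirrors the new guard: queue only if the stats
// suggest a large abort span, the range is not already queued, and the
// last GC (if known) happened more than txnCleanupThreshold ago.
func shouldQueueForAbortSpan(largeAbortSpan, alreadyQueued bool, likelyLastGC time.Duration) bool {
	if !largeAbortSpan || alreadyQueued {
		return false
	}
	return likelyLastGC == 0 || likelyLastGC > txnCleanupThreshold
}

func main() {
	fmt.Println(shouldQueueForAbortSpan(true, false, 2*time.Hour))    // true: last GC long ago
	fmt.Println(shouldQueueForAbortSpan(true, false, 10*time.Minute)) // false: GC'ed recently
	fmt.Println(shouldQueueForAbortSpan(true, false, 0))              // true: last GC unknown
}
```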
40 changes: 37 additions & 3 deletions pkg/storage/gc_queue_test.go
@@ -36,6 +36,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/kr/pretty"
"github.com/pkg/errors"
"github.com/stretchr/testify/require"
"golang.org/x/sync/syncmap"
)

@@ -103,7 +104,7 @@ func TestGCQueueMakeGCScoreInvariantQuick(t *testing.T) {
GCBytesAge: gcByteAge,
}
now := initialNow.Add(timePassed.Nanoseconds(), 0)
r := makeGCQueueScoreImpl(ctx, int64(seed), now, ms, ttlSec)
r := makeGCQueueScoreImpl(ctx, int64(seed), now, ms, ttlSec, hlc.Timestamp{})
wouldHaveToDeleteSomething := gcBytes*int64(ttlSec) < ms.GCByteAge(now.WallTime)
result := !r.ShouldQueue || wouldHaveToDeleteSomething
if !result {
@@ -124,13 +125,46 @@ func TestGCQueueMakeGCScoreAnomalousStats(t *testing.T) {
LiveBytes: int64(liveBytes),
ValBytes: int64(valBytes),
KeyBytes: int64(keyBytes),
}, 60)
}, 60, hlc.Timestamp{})
return r.DeadFraction >= 0 && r.DeadFraction <= 1
}, &quick.Config{MaxCount: 1000}); err != nil {
t.Fatal(err)
}
}

func TestGCQueueMakeGCScoreLargeAbortSpan(t *testing.T) {
defer leaktest.AfterTest(t)()
const seed = 1
var ms enginepb.MVCCStats
ms.SysCount += probablyLargeAbortSpanSysCountThreshold
ms.SysBytes += probablyLargeAbortSpanSysBytesThreshold

gcThresh := hlc.Timestamp{WallTime: 1}
expiration := storagebase.TxnCleanupThreshold.Nanoseconds() + 1
const ttlSec = 10000

// GC triggered if abort span should all be gc'able and it's likely large.
{
r := makeGCQueueScoreImpl(
context.Background(), seed,
hlc.Timestamp{WallTime: expiration + 1},
ms, ttlSec, gcThresh,
)
require.True(t, r.ShouldQueue)
require.NotZero(t, r.FinalScore)
}

// Heuristic doesn't fire if likely last GC within TxnCleanupThreshold.
{
r := makeGCQueueScoreImpl(context.Background(), seed,
hlc.Timestamp{WallTime: expiration},
ms, ttlSec, gcThresh,
)
require.False(t, r.ShouldQueue)
require.Zero(t, r.FinalScore)
}
}

const cacheFirstLen = 3

type gcTestCacheKey struct {
@@ -244,7 +278,7 @@ func (cws *cachedWriteSimulator) shouldQueue(
) {
cws.t.Helper()
ts := hlc.Timestamp{}.Add(ms.LastUpdateNanos+after.Nanoseconds(), 0)
r := makeGCQueueScoreImpl(context.Background(), 0 /* seed */, ts, ms, int32(ttl.Seconds()))
r := makeGCQueueScoreImpl(context.Background(), 0 /* seed */, ts, ms, int32(ttl.Seconds()), hlc.Timestamp{})
if fmt.Sprintf("%.2f", r.FinalScore) != fmt.Sprintf("%.2f", prio) || b != r.ShouldQueue {
cws.t.Errorf("expected queued=%t (is %t), prio=%.2f, got %.2f: after=%s, ttl=%s:\nms: %+v\nscore: %s",
b, r.ShouldQueue, prio, r.FinalScore, after, ttl, ms, r)
