From 70be36b2155c2fa58e16ab0b193e3a4286d8e6b1 Mon Sep 17 00:00:00 2001 From: Cheran Mahalingam Date: Tue, 9 Apr 2024 18:46:18 -0400 Subject: [PATCH] sstable: reduce block cache fragmentation Previously, the sstable writer contained heuristics to flush sstable blocks when the size reached a certain threshold. In CRDB this is defined as 32KiB. However, when these blocks are loaded into memory additional metadata is allocated with the block causing the allocation to go beyond this threshold. Since CRDB uses jemalloc, these allocations use a 40KiB size class which leads to internal fragmentation and higher memory usage. This commit decrements the block size threshold to reduce internal memory fragmentation. --- internal/cache/clockpro.go | 1 + internal/cache/value_invariants.go | 6 ++++++ internal/cache/value_normal.go | 16 +++++++++++----- sstable/options.go | 9 +++++++-- 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/internal/cache/clockpro.go b/internal/cache/clockpro.go index d39c0725969..61f92acdc8b 100644 --- a/internal/cache/clockpro.go +++ b/internal/cache/clockpro.go @@ -683,6 +683,7 @@ type Cache struct { // defer c.Unref() // d, err := pebble.Open(pebble.Options{Cache: c}) func New(size int64) *Cache { + fmt.Printf("NEW CACHE SIZE %d\n", size) // How many cache shards should we create? // // Note that the probability two processors will try to access the same diff --git a/internal/cache/value_invariants.go b/internal/cache/value_invariants.go index 1e30d2714bd..a01a780b09c 100644 --- a/internal/cache/value_invariants.go +++ b/internal/cache/value_invariants.go @@ -15,6 +15,12 @@ import ( "github.com/cockroachdb/pebble/internal/manual" ) +// NewValueMetadataSize returns the number of bytes of metadata allocated for +// a cache entry. +func NewValueMetadataSize() int { + return 0 +} + // newValue creates a Value with a manually managed buffer of size n. // // This definition of newValue is used when either the "invariants" or diff --git a/internal/cache/value_normal.go b/internal/cache/value_normal.go index e03379d53f8..880701d931a 100644 --- a/internal/cache/value_normal.go +++ b/internal/cache/value_normal.go @@ -8,6 +8,7 @@ package cache import ( + "fmt" "unsafe" "github.com/cockroachdb/pebble/internal/manual" @@ -15,6 +16,15 @@ import ( const valueSize = int(unsafe.Sizeof(Value{})) +// NewValueMetadataSize returns the number of bytes of metadata allocated for +// a cache entry. +func NewValueMetadataSize() int { + if cgoEnabled { + return valueSize + } + return 0 +} + func newValue(n int) *Value { if n == 0 { return nil @@ -31,11 +41,7 @@ func newValue(n int) *Value { // When we're not performing leak detection, the lifetime of the returned // Value is exactly the lifetime of the backing buffer and we can manually // allocate both. - // - // TODO(peter): It may be better to separate the allocation of the value and - // the buffer in order to reduce internal fragmentation in malloc. If the - // buffer is right at a power of 2, adding valueSize might push the - // allocation over into the next larger size. + fmt.Printf("ALLOC SIZE %d\n", valueSize+n) b := manual.New(valueSize + n) v := (*Value)(unsafe.Pointer(&b[0])) v.buf = b[valueSize:] diff --git a/sstable/options.go b/sstable/options.go index 8d88a32debd..df216159e68 100644 --- a/sstable/options.go +++ b/sstable/options.go @@ -236,9 +236,12 @@ func (o WriterOptions) ensureDefaults() WriterOptions { if o.BlockRestartInterval <= 0 { o.BlockRestartInterval = base.DefaultBlockRestartInterval } - if o.BlockSize <= 0 { + // The target block size is decremented to reduce internal fragmentation when + // blocks are loaded into the block cache. + if o.BlockSize <= cache.NewValueMetadataSize() { o.BlockSize = base.DefaultBlockSize } + o.BlockSize -= 0 if o.BlockSizeThreshold <= 0 { o.BlockSizeThreshold = base.DefaultBlockSizeThreshold } @@ -248,8 +251,10 @@ func (o WriterOptions) ensureDefaults() WriterOptions { if o.Compression <= DefaultCompression || o.Compression >= NCompression { o.Compression = SnappyCompression } - if o.IndexBlockSize <= 0 { + if o.IndexBlockSize <= cache.NewValueMetadataSize() { o.IndexBlockSize = o.BlockSize + } else { + o.IndexBlockSize -= 0 } if o.MergerName == "" { o.MergerName = base.DefaultMerger.Name