Skip to content

Commit

Permalink
sstable: reduce block cache fragmentation
Browse files Browse the repository at this point in the history
Previously, the sstable writer contained heuristics to flush sstable
blocks when the size reached a certain threshold. In CRDB this is
defined as 32KiB. However, when these blocks are loaded into memory,
additional metadata is allocated with the block, causing the allocation
to go beyond this threshold. Since CRDB uses jemalloc, these allocations
use a 40KiB size class which leads to internal fragmentation and higher
memory usage. This commit decrements the block size threshold to reduce
internal memory fragmentation.

Informs: #999.
  • Loading branch information
CheranMahalingam committed Apr 12, 2024
1 parent 278b6a6 commit afe5e62
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 7 deletions.
6 changes: 6 additions & 0 deletions internal/cache/value_invariants.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ import (
"github.com/cockroachdb/pebble/internal/manual"
)

// NewValueMetadataSize reports how many bytes of metadata are allocated
// together with a cache entry's buffer.
//
// This definition always reports zero.
func NewValueMetadataSize() int {
	const metadataBytes = 0
	return metadataBytes
}

// newValue creates a Value with a manually managed buffer of size n.
//
// This definition of newValue is used when either the "invariants" or
Expand Down
14 changes: 9 additions & 5 deletions internal/cache/value_normal.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,15 @@ import (

// valueSize is the size in bytes of a Value struct, as reported by
// unsafe.Sizeof, converted to int.
const valueSize = int(unsafe.Sizeof(Value{}))

// NewValueMetadataSize reports how many bytes of metadata are allocated
// together with a cache entry's buffer.
//
// It returns valueSize when cgoEnabled is true and zero otherwise.
// NOTE(review): presumably the header only shares the buffer's allocation
// when the manual allocator is backed by cgo; confirm against
// internal/manual.
func NewValueMetadataSize() int {
	if !cgoEnabled {
		return 0
	}
	return valueSize
}

func newValue(n int) *Value {
if n == 0 {
return nil
Expand All @@ -31,11 +40,6 @@ func newValue(n int) *Value {
// When we're not performing leak detection, the lifetime of the returned
// Value is exactly the lifetime of the backing buffer and we can manually
// allocate both.
//
// TODO(peter): It may be better to separate the allocation of the value and
// the buffer in order to reduce internal fragmentation in malloc. If the
// buffer is right at a power of 2, adding valueSize might push the
// allocation over into the next larger size.
b := manual.New(valueSize + n)
v := (*Value)(unsafe.Pointer(&b[0]))
v.buf = b[valueSize:]
Expand Down
9 changes: 7 additions & 2 deletions sstable/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -236,9 +236,12 @@ func (o WriterOptions) ensureDefaults() WriterOptions {
if o.BlockRestartInterval <= 0 {
o.BlockRestartInterval = base.DefaultBlockRestartInterval
}
if o.BlockSize <= 0 {
// The target block size is decremented to reduce internal fragmentation when
// blocks are loaded into the block cache.
if o.BlockSize <= cache.NewValueMetadataSize() {
o.BlockSize = base.DefaultBlockSize
}
o.BlockSize -= cache.NewValueMetadataSize()
if o.BlockSizeThreshold <= 0 {
o.BlockSizeThreshold = base.DefaultBlockSizeThreshold
}
Expand All @@ -248,8 +251,10 @@ func (o WriterOptions) ensureDefaults() WriterOptions {
if o.Compression <= DefaultCompression || o.Compression >= NCompression {
o.Compression = SnappyCompression
}
if o.IndexBlockSize <= 0 {
if o.IndexBlockSize <= cache.NewValueMetadataSize() {
o.IndexBlockSize = o.BlockSize
} else {
o.IndexBlockSize -= cache.NewValueMetadataSize()
}
if o.MergerName == "" {
o.MergerName = base.DefaultMerger.Name
Expand Down

0 comments on commit afe5e62

Please sign in to comment.