sstable: reduce block cache memory fragmentation

Currently, the sstable writer contains heuristics to flush sstable
blocks once their size reaches a specified threshold. In CRDB this is
defined as 32KiB. However, when these blocks are loaded into memory,
additional metadata is allocated, sometimes pushing the total above the
32KiB threshold. Since CRDB uses jemalloc, these allocations fall into
a 40KiB size class, which leads to significant internal fragmentation.
In addition, since the system is unaware of these size classes, we
cannot design heuristics that prioritize reducing memory fragmentation.
Reducing internal fragmentation helps reduce CRDB's memory footprint.
This commit decrements the target block size to prevent internal
fragmentation for small key-value pairs, and adds support for
optionally specifying allocator size classes to enable a new set of
heuristics that reduce internal fragmentation for workloads with larger
key-value pairs.

Fixes: #999.
CheranMahalingam committed Apr 19, 2024
1 parent c34894c commit f874ba5
Showing 13 changed files with 274 additions and 67 deletions.
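
Before the file-by-file diff, a minimal sketch of the size-class-aware flush
decision may help. This is only an illustration of the idea described in the
commit message, not Pebble's actual implementation; the function and parameter
names (shouldFlushWithHints, sizeClassFor, valueMetadataSize) and the example
size classes are assumptions for the sketch.

package main

import "fmt"

// sizeClassFor returns the smallest allocator size class that can hold n
// bytes, or -1 if n exceeds the largest known class.
func sizeClassFor(n int, classes []int) int {
	for _, c := range classes {
		if n <= c {
			return c
		}
	}
	return -1
}

// shouldFlushWithHints sketches a flush decision that tries to minimize the
// allocator space wasted once the finished block, plus the block cache's
// per-entry metadata, is loaded into the cache.
func shouldFlushWithHints(blockSize, nextEntrySize, targetSize, valueMetadataSize int, classes []int) bool {
	if len(classes) == 0 {
		// No allocator hints: fall back to a plain size threshold.
		return blockSize >= targetSize
	}
	newSize := blockSize + nextEntrySize
	curClass := sizeClassFor(blockSize+valueMetadataSize, classes)
	newClass := sizeClassFor(newSize+valueMetadataSize, classes)
	if curClass < 0 || newClass < 0 {
		// The block no longer fits any known size class; flush now.
		return true
	}
	// Internal fragmentation is the gap between the size class the allocator
	// would pick and the bytes the cached block actually needs.
	curWaste := curClass - (blockSize + valueMetadataSize)
	newWaste := newClass - (newSize + valueMetadataSize)
	if newWaste <= curWaste {
		// Appending the entry fills its size class at least as tightly, so
		// keep growing the block even past the nominal target.
		return false
	}
	// Growing the block would waste more allocator space than flushing at the
	// current size; flush once the block is reasonably full.
	return blockSize >= targetSize || newSize > targetSize
}

func main() {
	classes := []int{8 << 10, 16 << 10, 32 << 10, 40 << 10, 48 << 10}
	// A 30KiB block plus ~100 bytes of cache metadata still fits the 32KiB
	// class; appending a 3KiB entry would spill into the 40KiB class, so the
	// sketch reports that the block should be flushed first.
	fmt.Println(shouldFlushWithHints(30<<10, 3<<10, 32<<10, 100, classes))
}
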
9 changes: 5 additions & 4 deletions data_test.go
@@ -21,6 +21,7 @@ import (
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble/bloom"
"github.com/cockroachdb/pebble/internal/base"
"github.com/cockroachdb/pebble/internal/cache"
"github.com/cockroachdb/pebble/internal/humanize"
"github.com/cockroachdb/pebble/internal/keyspan"
"github.com/cockroachdb/pebble/internal/private"
@@ -523,8 +524,8 @@ func runBuildRemoteCmd(td *datadriven.TestData, d *DB, storage remote.Storage) e
// Force two-level indexes if not already forced on or off.
blockSize = 5
}
writeOpts.BlockSize = int(blockSize)
writeOpts.IndexBlockSize = int(blockSize)
writeOpts.BlockSize = int(blockSize) + cache.ValueMetadataSize
writeOpts.IndexBlockSize = writeOpts.BlockSize

f, err := storage.CreateObject(path)
if err != nil {
@@ -1427,7 +1428,7 @@ func parseDBOptionsArgs(opts *Options, args []datadriven.CmdArg) error {
return err
}
for i := range opts.Levels {
opts.Levels[i].BlockSize = v
opts.Levels[i].BlockSize = v + cache.ValueMetadataSize
}
case "cache-size":
if opts.Cache != nil {
@@ -1445,7 +1446,7 @@ func parseDBOptionsArgs(opts *Options, args []datadriven.CmdArg) error {
return err
}
for i := range opts.Levels {
opts.Levels[i].IndexBlockSize = v
opts.Levels[i].IndexBlockSize = v + cache.ValueMetadataSize
}
case "target-file-size":
v, err := strconv.Atoi(cmdArg.Vals[0])
43 changes: 43 additions & 0 deletions internal/cache/value_cgo.go
@@ -0,0 +1,43 @@
// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

//go:build ((!invariants && !tracing) || race) && cgo
// +build !invariants,!tracing race
// +build cgo

package cache

import (
"unsafe"

"github.com/cockroachdb/pebble/internal/manual"
)

// ValueMetadataSize denotes the number of bytes of metadata allocated for a
// cache entry.
const ValueMetadataSize = int(unsafe.Sizeof(Value{}))

func newValue(n int) *Value {
if n == 0 {
return nil
}

// When we're not performing leak detection, the lifetime of the returned
// Value is exactly the lifetime of the backing buffer and we can manually
// allocate both.
b := manual.New(ValueMetadataSize + n)
v := (*Value)(unsafe.Pointer(&b[0]))
v.buf = b[ValueMetadataSize:]
v.ref.init(1)
return v
}

func (v *Value) free() {
// When we're not performing leak detection, the Value and buffer were
// allocated contiguously.
n := ValueMetadataSize + cap(v.buf)
buf := (*[manual.MaxArrayLen]byte)(unsafe.Pointer(v))[:n:n]
v.buf = nil
manual.Free(buf)
}
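
To make the fragmentation concrete: under this build, caching a block of n
bytes allocates ValueMetadataSize + n contiguous bytes from the allocator. With
the 32KiB default block size mentioned in the commit message, that total lands
just above 32KiB and jemalloc rounds it up to the 40KiB size class, wasting
roughly 8KiB per cached data block. Shrinking the writer's target block size by
ValueMetadataSize (see sstable/options.go below) keeps the combined allocation
within the 32KiB class.
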
4 changes: 4 additions & 0 deletions internal/cache/value_invariants.go
@@ -15,6 +15,10 @@ import (
"github.com/cockroachdb/pebble/internal/manual"
)

// ValueMetadataSize denotes the number of bytes of metadata allocated for a
// cache entry.
const ValueMetadataSize = 0

// newValue creates a Value with a manually managed buffer of size n.
//
// This definition of newValue is used when either the "invariants" or
45 changes: 8 additions & 37 deletions internal/cache/value_normal.go
@@ -2,56 +2,27 @@
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

//go:build (!invariants && !tracing) || race
//go:build ((!invariants && !tracing) || race) && !cgo
// +build !invariants,!tracing race
// +build !cgo

package cache

import (
"unsafe"

"github.com/cockroachdb/pebble/internal/manual"
)

const valueSize = int(unsafe.Sizeof(Value{}))
// ValueMetadataSize denotes the number of bytes of metadata allocated for a
// cache entry.
const ValueMetadataSize = 0

func newValue(n int) *Value {
if n == 0 {
return nil
}

if !cgoEnabled {
// If Cgo is disabled then all memory is allocated from the Go heap and we
// can't play the trick below to combine the Value and buffer allocation.
v := &Value{buf: make([]byte, n)}
v.ref.init(1)
return v
}

// When we're not performing leak detection, the lifetime of the returned
// Value is exactly the lifetime of the backing buffer and we can manually
// allocate both.
//
// TODO(peter): It may be better to separate the allocation of the value and
// the buffer in order to reduce internal fragmentation in malloc. If the
// buffer is right at a power of 2, adding valueSize might push the
// allocation over into the next larger size.
b := manual.New(valueSize + n)
v := (*Value)(unsafe.Pointer(&b[0]))
v.buf = b[valueSize:]
// Since cgo is disabled, all memory is allocated from the Go heap and we
// can't play the trick of combining the Value and buffer allocations.
v := &Value{buf: make([]byte, n)}
v.ref.init(1)
return v
}

func (v *Value) free() {
if !cgoEnabled {
return
}

// When we're not performing leak detection, the Value and buffer were
// allocated contiguously.
n := valueSize + cap(v.buf)
buf := (*[manual.MaxArrayLen]byte)(unsafe.Pointer(v))[:n:n]
v.buf = nil
manual.Free(buf)
}
6 changes: 5 additions & 1 deletion iterator_test.go
@@ -22,6 +22,7 @@ import (
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble/internal/base"
"github.com/cockroachdb/pebble/internal/bytealloc"
"github.com/cockroachdb/pebble/internal/cache"
"github.com/cockroachdb/pebble/internal/invalidating"
"github.com/cockroachdb/pebble/internal/manifest"
"github.com/cockroachdb/pebble/internal/testkeys"
@@ -1190,7 +1191,10 @@ func TestIteratorBlockIntervalFilter(t *testing.T) {
FormatMajorVersion: internalFormatNewest,
BlockPropertyCollectors: bpCollectors,
}
lo := LevelOptions{BlockSize: 1, IndexBlockSize: 1}
lo := LevelOptions{
BlockSize: 1 + cache.ValueMetadataSize,
IndexBlockSize: 1 + cache.ValueMetadataSize,
}
opts.Levels = append(opts.Levels, lo)

// Automatic compactions may compact away tombstones from L6, making
7 changes: 7 additions & 0 deletions options.go
@@ -1060,6 +1060,12 @@ type Options struct {
// to temporarily persist data spilled to disk for row-oriented SQL query execution.
EnableSQLRowSpillMetrics bool

// AllocatorSizeClasses provides a sorted list containing the supported size
// classes of the underlying memory allocator. This provides hints to the
// sstable block writer's flushing policy to select block sizes that
// preemptively reduce internal fragmentation when loaded into the block cache.
AllocatorSizeClasses []int

// private options are only used by internal tests or are used internally
// for facilitating upgrade paths of unconfigurable functionality.
private struct {
@@ -1970,6 +1976,7 @@ func (o *Options) MakeWriterOptions(level int, format sstable.TableFormat) sstab
writerOpts.FilterPolicy = levelOpts.FilterPolicy
writerOpts.FilterType = levelOpts.FilterType
writerOpts.IndexBlockSize = levelOpts.IndexBlockSize
writerOpts.AllocatorSizeClasses = o.AllocatorSizeClasses
return writerOpts
}

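
As a usage illustration (not part of this commit), a caller that knows its
allocator's size classes could pass them through the new option when opening a
database. The class values below are placeholders rather than an authoritative
jemalloc class list, and the directory name is arbitrary.

package main

import (
	"log"

	"github.com/cockroachdb/pebble"
)

func main() {
	opts := &pebble.Options{
		// Sorted size classes, in bytes, supported by the process's memory
		// allocator. These particular values are illustrative only.
		AllocatorSizeClasses: []int{
			16 << 10, 20 << 10, 24 << 10, 28 << 10, 32 << 10, 40 << 10, 48 << 10,
		},
	}
	db, err := pebble.Open("example-db", opts)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
}

MakeWriterOptions, shown in the hunk above, forwards the hint to every sstable
writer built from these options.
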
2 changes: 2 additions & 0 deletions sstable/data_test.go
@@ -41,6 +41,7 @@ func optsFromArgs(td *datadriven.TestData, writerOpts *WriterOptions) error {
if err != nil {
return err
}
writerOpts.BlockSize += cache.ValueMetadataSize
case "index-block-size":
if len(arg.Vals) != 1 {
return errors.Errorf("%s: arg %s expects 1 value", td.Cmd, arg.Key)
@@ -50,6 +51,7 @@ func optsFromArgs(td *datadriven.TestData, writerOpts *WriterOptions) error {
if err != nil {
return err
}
writerOpts.IndexBlockSize += cache.ValueMetadataSize
case "filter":
writerOpts.FilterPolicy = bloom.FilterPolicy(10)
case "comparer-split-4b-suffix":
15 changes: 13 additions & 2 deletions sstable/options.go
@@ -230,15 +230,24 @@ type WriterOptions struct {
// 750MB sstables -- see
// https://github.com/cockroachdb/cockroach/issues/117113).
DisableValueBlocks bool

// AllocatorSizeClasses provides a sorted list containing the supported size
// classes of the underlying memory allocator. This provides hints to the
// writer's flushing policy to select block sizes that preemptively reduce
// internal fragmentation when loaded into the block cache.
AllocatorSizeClasses []int
}

func (o WriterOptions) ensureDefaults() WriterOptions {
if o.BlockRestartInterval <= 0 {
o.BlockRestartInterval = base.DefaultBlockRestartInterval
}
if o.BlockSize <= 0 {
// The target block size is decremented to reduce internal fragmentation when
// blocks are loaded into the block cache.
if o.BlockSize <= cache.ValueMetadataSize {
o.BlockSize = base.DefaultBlockSize
}
o.BlockSize -= cache.ValueMetadataSize
if o.BlockSizeThreshold <= 0 {
o.BlockSizeThreshold = base.DefaultBlockSizeThreshold
}
@@ -248,8 +257,10 @@ func (o WriterOptions) ensureDefaults() WriterOptions {
if o.Compression <= DefaultCompression || o.Compression >= NCompression {
o.Compression = SnappyCompression
}
if o.IndexBlockSize <= 0 {
if o.IndexBlockSize <= cache.ValueMetadataSize {
o.IndexBlockSize = o.BlockSize
} else {
o.IndexBlockSize -= cache.ValueMetadataSize
}
if o.MergerName == "" {
o.MergerName = base.DefaultMerger.Name
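
A hypothetical in-package test sketch (not part of this commit) illustrates the
effect of the decrement; it assumes the cgo build, where cache.ValueMetadataSize
is nonzero, and the test name is invented for the example.

package sstable

import (
	"testing"

	"github.com/cockroachdb/pebble/internal/cache"
)

// TestEnsureDefaultsDecrementsBlockSize checks that caller-specified block
// sizes are reduced by the cache's per-value metadata, so that block data plus
// metadata fits within the size the caller asked for.
func TestEnsureDefaultsDecrementsBlockSize(t *testing.T) {
	o := WriterOptions{BlockSize: 32 << 10, IndexBlockSize: 32 << 10}.ensureDefaults()
	if want := 32<<10 - cache.ValueMetadataSize; o.BlockSize != want {
		t.Fatalf("BlockSize = %d, want %d", o.BlockSize, want)
	}
	if want := 32<<10 - cache.ValueMetadataSize; o.IndexBlockSize != want {
		t.Fatalf("IndexBlockSize = %d, want %d", o.IndexBlockSize, want)
	}
}
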
4 changes: 2 additions & 2 deletions sstable/reader_test.go
@@ -1406,8 +1406,8 @@ func TestReaderChecksumErrors(t *testing.T) {
}

w := NewWriter(objstorageprovider.NewFileWritable(f), WriterOptions{
BlockSize: blockSize,
IndexBlockSize: indexBlockSize,
BlockSize: blockSize + cache.ValueMetadataSize,
IndexBlockSize: indexBlockSize + cache.ValueMetadataSize,
Checksum: checksumType,
})
require.NoError(t, w.Set(bytes.Repeat([]byte("a"), blockSize), nil))
5 changes: 3 additions & 2 deletions sstable/test_fixtures.go
@@ -17,6 +17,7 @@ import (

"github.com/cockroachdb/pebble/bloom"
"github.com/cockroachdb/pebble/internal/base"
"github.com/cockroachdb/pebble/internal/cache"
"github.com/cockroachdb/pebble/objstorage/objstorageprovider"
"github.com/cockroachdb/pebble/vfs"
)
@@ -266,8 +267,8 @@ func (tf TestFixtureInfo) Build(fs vfs.FS, filename string) error {
}

const fixtureDefaultIndexBlockSize = math.MaxInt32
const fixtureSmallIndexBlockSize = 128
const fixtureBlockSize = 2048
const fixtureSmallIndexBlockSize = 128 + cache.ValueMetadataSize
const fixtureBlockSize = 2048 + cache.ValueMetadataSize
const fixtureFormat = TableFormatPebblev1

var fixtureComparer = func() *Comparer {
54 changes: 54 additions & 0 deletions sstable/testdata/flush_heuristics
@@ -0,0 +1,54 @@
# Block size exceeds target block size.
build key-size=0 val-size=0 block-size=64 target-size=64 threshold=59
----
true

# Block size does not exceed threshold size.
build key-size=0 val-size=0 block-size=59 target-size=64 threshold=59
----
false

# New block size exceeds the target size.
build key-size=1 val-size=1 block-size=60 target-size=64 threshold=32
----
true

# New block size does not exceed the target size.
build key-size=1 val-size=1 block-size=40 target-size=64 threshold=32
----
false

# New block size does not exceed the target size with hints enabled.
build key-size=1 val-size=1 block-size=36 target-size=64 threshold=0 hints=8,16,32,64,128
----
false

# New block size reduces internal fragmentation.
build key-size=1 val-size=60 block-size=38 target-size=64 threshold=0 hints=8,16,32,64,128
----
false

# New block size increases internal fragmentation.
build key-size=1 val-size=40 block-size=38 target-size=64 threshold=0 hints=8,16,32,64,128
----
true

# Block size target exceeded with hints enabled.
build key-size=1 val-size=1 block-size=64 target-size=64 threshold=0 hints=8,16,32,64,128
----
true

# Block size target exceeded, however, new block would reduce internal fragmentation.
build key-size=1 val-size=1 block-size=70 target-size=64 threshold=0 hints=8,16,32,64,128
----
false

# Fall back to heuristics with hints disabled when size class is limited.
build key-size=1 val-size=1 block-size=59 target-size=64 threshold=59 hints=8,16,32
----
false

# Flush when new size class could not be computed.
build key-size=1 val-size=60 block-size=50 target-size=64 threshold=0 hints=8,16,32,64
----
true
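
Reading the two "fragmentation" cases together shows the new behavior: both
start from a 38-byte block, and only the incoming value size differs. The
larger value pushes the block near the top of the next allocator size class,
so growing wastes no more space than flushing now and the writer keeps
appending (false); the smaller value would land the grown block lower in that
class, wasting more than flushing at the current, tighter fit, so the writer
flushes first (true). The exact byte accounting also depends on per-entry
encoding overhead and any metadata the heuristic folds in, which this summary
glosses over.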