diff --git a/sstable/buffer_pool.go b/sstable/buffer_pool.go index b12318f294..2e98d44dae 100644 --- a/sstable/buffer_pool.go +++ b/sstable/buffer_pool.go @@ -66,6 +66,15 @@ func (p *BufferPool) Init(initialSize int) { } } +// initPreallocated is like Init but for internal sstable package use in +// instances where a pre-allocated slice of []allocedBuffer already exists. It's +// used to avoid an extra allocation initializing BufferPool.pool. +func (p *BufferPool) initPreallocated(pool []allocedBuffer) { + *p = BufferPool{ + pool: pool[:0], + } +} + // Release releases all buffers held by the pool and resets the pool to an // uninitialized state. func (p *BufferPool) Release() { diff --git a/sstable/reader.go b/sstable/reader.go index 3c0ec5b672..b1ef0903ad 100644 --- a/sstable/reader.go +++ b/sstable/reader.go @@ -3055,6 +3055,14 @@ type Reader struct { rawTombstones bool mergerOK bool checksumType ChecksumType + // metaBufferPool is a buffer pool used exclusively when opening a table and + // loading its meta blocks. metaBufferPoolAlloc is used to batch-allocate + // the BufferPool.pool slice as a part of the Reader allocation. It's + // capacity 3 to accommodate the meta block (1), and both the compressed + // properties block (1) and decompressed properties block (1) + // simultaneously. + metaBufferPool BufferPool + metaBufferPoolAlloc [3]allocedBuffer } // Close implements DB.Close, as documented in the pebble package. @@ -3506,8 +3514,21 @@ func (r *Reader) transformRangeDelV1(b []byte) ([]byte, error) { } func (r *Reader) readMetaindex(metaindexBH BlockHandle) error { + // We use a BufferPool when reading metaindex blocks in order to avoid + // populating the block cache with these blocks. In heavy-write workloads, + // especially with high compaction concurrency, new tables may be created + // frequently. Populating the block cache with these metaindex blocks adds + // additional contention on the block cache mutexes (see #1997). 
+ // Additionally, these blocks are exceedingly unlikely to be read again + // while they're still in the block cache except in misconfigurations with + // excessive sstable counts or a table cache that's far too small. + r.metaBufferPool.initPreallocated(r.metaBufferPoolAlloc[:0]) + // When we're finished, release the buffers we've allocated back to memory + // allocator. We don't expect to use metaBufferPool again. + defer r.metaBufferPool.Release() + b, err := r.readBlock( - context.Background(), metaindexBH, nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */) + context.Background(), metaindexBH, nil /* transform */, nil /* readHandle */, nil /* stats */, &r.metaBufferPool) if err != nil { return err } diff --git a/testdata/event_listener b/testdata/event_listener index c1da34e65b..c379235ad2 100644 --- a/testdata/event_listener +++ b/testdata/event_listener @@ -284,8 +284,8 @@ compact 1 2.0 K 0 B 0 (size == esti memtbl 1 256 K zmemtbl 0 0 B ztbl 0 0 B - bcache 8 1.2 K 11.1% (score == hit-rate) - tcache 1 680 B 40.0% (score == hit-rate) + bcache 6 1.1 K 11.1% (score == hit-rate) + tcache 1 800 B 40.0% (score == hit-rate) snaps 0 - 0 (score == earliest seq num) titers 0 filter - - 0.0% (score == utility) @@ -378,8 +378,8 @@ compact 1 4.0 K 0 B 0 (size == esti memtbl 1 512 K zmemtbl 0 0 B ztbl 0 0 B - bcache 16 2.5 K 14.3% (score == hit-rate) - tcache 1 680 B 50.0% (score == hit-rate) + bcache 12 2.3 K 14.3% (score == hit-rate) + tcache 1 800 B 50.0% (score == hit-rate) snaps 0 - 0 (score == earliest seq num) titers 0 filter - - 0.0% (score == utility) diff --git a/testdata/ingest b/testdata/ingest index 62c9018a41..2cbba8efeb 100644 --- a/testdata/ingest +++ b/testdata/ingest @@ -47,8 +47,8 @@ compact 0 0 B 0 B 0 (size == esti memtbl 1 256 K zmemtbl 0 0 B ztbl 0 0 B - bcache 8 1.2 K 42.9% (score == hit-rate) - tcache 1 680 B 50.0% (score == hit-rate) + bcache 6 1.2 K 35.7% (score == hit-rate) + tcache 1 800 B 50.0% (score == 
hit-rate) snaps 0 - 0 (score == earliest seq num) titers 0 filter - - 0.0% (score == utility) diff --git a/testdata/metrics b/testdata/metrics index a48140a1eb..0bc8cd3378 100644 --- a/testdata/metrics +++ b/testdata/metrics @@ -33,8 +33,8 @@ compact 0 0 B 0 B 0 (size == esti memtbl 1 256 K zmemtbl 1 256 K ztbl 0 0 B - bcache 4 560 B 0.0% (score == hit-rate) - tcache 1 680 B 0.0% (score == hit-rate) + bcache 3 528 B 0.0% (score == hit-rate) + tcache 1 800 B 0.0% (score == hit-rate) snaps 0 - 0 (score == earliest seq num) titers 1 filter - - 0.0% (score == utility) @@ -82,8 +82,8 @@ compact 1 0 B 0 B 0 (size == esti memtbl 1 256 K zmemtbl 2 512 K ztbl 2 1.2 K - bcache 7 1.1 K 42.9% (score == hit-rate) - tcache 2 1.3 K 66.7% (score == hit-rate) + bcache 5 1.0 K 42.9% (score == hit-rate) + tcache 2 1.6 K 66.7% (score == hit-rate) snaps 0 - 0 (score == earliest seq num) titers 2 filter - - 0.0% (score == utility) @@ -116,8 +116,8 @@ compact 1 0 B 0 B 0 (size == esti memtbl 1 256 K zmemtbl 1 256 K ztbl 2 1.2 K - bcache 7 1.1 K 42.9% (score == hit-rate) - tcache 2 1.3 K 66.7% (score == hit-rate) + bcache 5 1.0 K 42.9% (score == hit-rate) + tcache 2 1.6 K 66.7% (score == hit-rate) snaps 0 - 0 (score == earliest seq num) titers 2 filter - - 0.0% (score == utility) @@ -147,8 +147,8 @@ compact 1 0 B 0 B 0 (size == esti memtbl 1 256 K zmemtbl 1 256 K ztbl 1 633 B - bcache 4 560 B 42.9% (score == hit-rate) - tcache 1 680 B 66.7% (score == hit-rate) + bcache 3 528 B 42.9% (score == hit-rate) + tcache 1 800 B 66.7% (score == hit-rate) snaps 0 - 0 (score == earliest seq num) titers 1 filter - - 0.0% (score == utility) @@ -375,8 +375,8 @@ compact 2 4.8 K 0 B 0 (size == esti memtbl 1 1.0 M zmemtbl 0 0 B ztbl 0 0 B - bcache 16 2.4 K 34.4% (score == hit-rate) - tcache 3 2.0 K 57.9% (score == hit-rate) + bcache 12 2.3 K 31.1% (score == hit-rate) + tcache 3 2.3 K 57.9% (score == hit-rate) snaps 0 - 0 (score == earliest seq num) titers 0 filter - - 0.0% (score == utility)