From 87f0a80c9271cd24314d783225591d3cd68c4bc1 Mon Sep 17 00:00:00 2001
From: Jackson Owens
Date: Tue, 25 Jul 2023 15:01:44 -0400
Subject: [PATCH] db: keep up to one memtable for recycling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We've observed large allocations, like the 64MB memtable allocation,
take 10ms+. This can add latency to the WAL/memtable rotation critical
section during which the entire commit pipeline is stalled,
contributing to batch commit tail latencies.

This commit adapts the memtable lifecycle to keep the most recent
obsolete memtable around for use as the next mutable memtable. This
reduces the commit latency hiccup during a memtable rotation, and it
also reduces block cache mutex contention (#1997) by reducing the
number of times we must reserve memory from the block cache.

```
goos: linux
goarch: amd64
pkg: github.com/cockroachdb/pebble
cpu: Intel(R) Xeon(R) CPU @ 2.30GHz
                   │   old.txt    │               new.txt                │
                   │    sec/op    │    sec/op     vs base                 │
RotateMemtables-24   120.7µ ± 2%    102.8µ ± 4%  -14.85% (p=0.000 n=25)

                   │   old.txt    │               new.txt                │
                   │     B/op     │     B/op      vs base                 │
RotateMemtables-24   124.3Ki ± 0%   124.0Ki ± 0%   -0.27% (p=0.000 n=25)

                   │   old.txt    │              new.txt                │
                   │  allocs/op   │  allocs/op    vs base                │
RotateMemtables-24    114.0 ± 0%    111.0 ± 0%   -2.63% (p=0.000 n=25)
```

Informs #2646.
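The recycling scheme is, in effect, a single-slot free list guarded by
an atomic pointer: the allocation path swaps the slot empty, and the
release path swaps an obsolete memtable in, freeing whatever it
displaces. A minimal standalone sketch of the pattern follows; the
`buffer`, `acquire`, and `release` names are illustrative, not Pebble's
API.

```go
package main

import (
	"fmt"
	"sync/atomic"
)

// buffer stands in for a large, expensive-to-allocate object such as a
// memtable arena.
type buffer struct {
	data []byte
}

// recycleSlot holds at most one obsolete buffer for reuse, mirroring
// DB.memTableRecycle.
var recycleSlot atomic.Pointer[buffer]

// acquire returns a recycled buffer of the requested size if one is
// stashed; a stashed buffer of the wrong size is dropped. Otherwise it
// allocates a new buffer.
func acquire(size int) *buffer {
	if b := recycleSlot.Swap(nil); b != nil {
		if len(b.data) == size {
			return b // reuse the previous allocation
		}
		// Wrong size; fall through and allocate. (Pebble frees the
		// manually-allocated arena here; with GC, dropping the
		// reference suffices.)
	}
	return &buffer{data: make([]byte, size)}
}

// release stashes b for the next acquire. A previously stashed buffer
// is displaced and dropped, so at most one buffer is ever retained.
func release(b *buffer) {
	_ = recycleSlot.Swap(b)
}

func main() {
	a := acquire(64 << 20)
	release(a)
	b := acquire(64 << 20)
	fmt.Println("recycled:", a == b) // true: the allocation was reused
}
```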
---
 db.go                   | 77 +++++++++++++++++++++++++++++++++--------
 db_test.go              | 45 ++++++++++++++++++++----
 mem_table.go            | 40 ++++++++++++++-------
 metrics.go              |  5 ++-
 testdata/event_listener |  4 +--
 testdata/metrics        | 12 +++----
 6 files changed, 141 insertions(+), 42 deletions(-)

diff --git a/db.go b/db.go
index d95f4e68f7..0d42c7e1f4 100644
--- a/db.go
+++ b/db.go
@@ -253,9 +253,14 @@ func (d defaultCPUWorkGranter) CPUWorkDone(_ CPUWorkHandle) {}
 type DB struct {
 	// The count and size of referenced memtables. This includes memtables
 	// present in DB.mu.mem.queue, as well as memtables that have been flushed
-	// but are still referenced by an inuse readState.
+	// but are still referenced by an inuse readState, as well as up to one
+	// memTable waiting to be reused and stored in d.memTableRecycle.
 	memTableCount    atomic.Int64
 	memTableReserved atomic.Int64 // number of bytes reserved in the cache for memtables
+	// memTableRecycle holds a pointer to an obsolete memtable. The next
+	// memtable allocation will reuse this memtable if it has not already been
+	// recycled.
+	memTableRecycle atomic.Pointer[memTable]

 	// The size of the current log file (i.e. db.mu.log.queue[len(queue)-1].
 	logSize atomic.Uint64
@@ -1540,6 +1545,10 @@ func (d *DB) Close() error {
 		// replay.
 		mem.readerUnrefLocked(false)
 	}
+	// If there's an unused, recycled memtable, we need to release its memory.
+	if obsoleteMemTable := d.memTableRecycle.Swap(nil); obsoleteMemTable != nil {
+		d.freeMemTable(obsoleteMemTable)
+	}
 	if reserved := d.memTableReserved.Load(); reserved != 0 {
 		err = firstError(err, errors.Errorf("leaked memtable reservation: %d", errors.Safe(reserved)))
 	}
@@ -2175,30 +2184,68 @@ func (d *DB) newMemTable(logNum FileNum, logSeqNum uint64) (*memTable, *flushabl
 		}
 	}

-	d.memTableCount.Add(1)
-	d.memTableReserved.Add(int64(size))
-	releaseAccountingReservation := d.opts.Cache.Reserve(size)
-
-	mem := newMemTable(memTableOptions{
+	memtblOpts := memTableOptions{
 		Options:   d.opts,
-		arenaBuf:  manual.New(int(size)),
 		logSeqNum: logSeqNum,
-	})
+	}

-	// Note: this is a no-op if invariants are disabled or race is enabled.
-	invariants.SetFinalizer(mem, checkMemTable)
+	// Before attempting to allocate a new memtable, check if there's one
+	// available for recycling in memTableRecycle. Large contiguous allocations
+	// can be costly as fragmentation makes it more difficult to find a large
+	// contiguous free space. We've observed 64MB allocations taking 10ms+.
+	//
+	// To reduce these costly allocations, up to 1 obsolete memtable is stashed
+	// in `d.memTableRecycle` to allow a future memtable rotation to reuse
+	// existing memory.
+	var mem *memTable
+	mem = d.memTableRecycle.Swap(nil)
+	if mem != nil && len(mem.arenaBuf) != size {
+		d.freeMemTable(mem)
+		mem = nil
+	}
+	if mem != nil {
+		// Carry through the existing buffer and memory reservation.
+		memtblOpts.arenaBuf = mem.arenaBuf
+		memtblOpts.releaseAccountingReservation = mem.releaseAccountingReservation
+	} else {
+		mem = new(memTable)
+		memtblOpts.arenaBuf = manual.New(int(size))
+		memtblOpts.releaseAccountingReservation = d.opts.Cache.Reserve(size)
+		d.memTableCount.Add(1)
+		d.memTableReserved.Add(int64(size))

+		// Note: this is a no-op if invariants are disabled or race is enabled.
+		invariants.SetFinalizer(mem, checkMemTable)
+	}
+	mem.init(memtblOpts)

 	entry := d.newFlushableEntry(mem, logNum, logSeqNum)
 	entry.releaseMemAccounting = func() {
-		manual.Free(mem.arenaBuf)
-		mem.arenaBuf = nil
-		d.memTableCount.Add(-1)
-		d.memTableReserved.Add(-int64(size))
-		releaseAccountingReservation()
+		// If the user leaks iterators, we may be releasing the memtable after
+		// the DB is already closed. In this case, we want to just release the
+		// memory because DB.Close won't come along to free it for us.
+		if err := d.closed.Load(); err != nil {
+			d.freeMemTable(mem)
+			return
+		}
+
+		// The next memtable allocation might be able to reuse this memtable.
+		// Stash it on d.memTableRecycle.
+		if unusedMem := d.memTableRecycle.Swap(mem); unusedMem != nil {
+			// There was already a memtable waiting to be recycled. We're now
+			// responsible for freeing it.
+			d.freeMemTable(unusedMem)
+		}
 	}
 	return mem, entry
 }

+func (d *DB) freeMemTable(m *memTable) {
+	d.memTableCount.Add(-1)
+	d.memTableReserved.Add(-int64(len(m.arenaBuf)))
+	m.free()
+}
+
 func (d *DB) newFlushableEntry(f flushable, logNum FileNum, logSeqNum uint64) *flushableEntry {
 	fe := &flushableEntry{
 		flushable: f,
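Both endpoints of the slot use `Swap` rather than separate loads and
stores, so ownership of a displaced memtable transfers to exactly one
party, which is then responsible for freeing it. A standalone sketch of
that exactly-once-free invariant under concurrency; the `resource`,
`release`, and `free` names are hypothetical, not Pebble's.

```go
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// resource stands in for a memtable owning manually managed memory.
type resource struct{ id int }

var (
	slot  atomic.Pointer[resource]
	freed atomic.Int64 // counts how many resources were freed
)

func free(r *resource) { freed.Add(1) }

// release stashes r in the slot, freeing whatever it displaces. Because
// Swap atomically hands the displaced pointer to a single caller, each
// resource is freed exactly once even with concurrent releasers.
func release(r *resource) {
	if old := slot.Swap(r); old != nil {
		free(old)
	}
}

func main() {
	const n = 1000
	var wg sync.WaitGroup
	for i := 0; i < n; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			release(&resource{id: i})
		}(i)
	}
	wg.Wait()
	// Drain the slot, as DB.Close does, so nothing is leaked.
	if last := slot.Swap(nil); last != nil {
		free(last)
	}
	fmt.Println(freed.Load() == n) // true: exactly-once free
}
```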
diff --git a/db_test.go b/db_test.go
index 3ff2d0bafb..c375b797a2 100644
--- a/db_test.go
+++ b/db_test.go
@@ -827,20 +827,34 @@ func TestMemTableReservation(t *testing.T) {
 		t.Fatalf("expected failure, but found success: %s", h.Get())
 	}

-	// Flush the memtable. The memtable reservation should be unchanged because a
-	// new memtable will be allocated.
+	// Flush the memtable. The memtable reservation should double because the
+	// old memtable will be recycled, saved for the next memtable allocation.
 	require.NoError(t, d.Flush())
-	checkReserved(int64(opts.MemTableSize))
+	checkReserved(int64(2 * opts.MemTableSize))
+	// Flush again. The memtable reservation should be unchanged because at most
+	// 1 memtable may be preserved for recycling.

 	// Flush in the presence of an active iterator. The iterator will hold a
 	// reference to a readState which will in turn hold a reader reference to the
-	// memtable. While the iterator is open, there will be the memory for 2
-	// memtables reserved.
+	// memtable.
 	iter := d.NewIter(nil)
 	require.NoError(t, d.Flush())
+	// The flush moved the recycled memtable into position as an active mutable
+	// memtable. There are now two allocated memtables: 1 mutable and 1 pinned
+	// by the iterator's read state.
 	checkReserved(2 * int64(opts.MemTableSize))
+
+	// Flushing again should increase the reservation total to 3x: 1 active
+	// mutable, 1 for recycling, 1 pinned by the iterator's read state.
+	require.NoError(t, d.Flush())
+	checkReserved(3 * int64(opts.MemTableSize))
+
+	// Closing the iterator will release the iterator's read state, and the old
+	// memtable will be moved into position as the next memtable to recycle.
+	// There was already a memtable ready to be recycled, so that memtable will
+	// be freed and the overall reservation total is reduced to 2x.
 	require.NoError(t, iter.Close())
-	checkReserved(int64(opts.MemTableSize))
+	checkReserved(2 * int64(opts.MemTableSize))

 	require.NoError(t, d.Close())
 }
@@ -1935,3 +1949,22 @@ func verifyGetNotFound(t *testing.T, r Reader, key []byte) {
 		t.Fatalf("expected nil, but got %s", val)
 	}
 }
+
+func BenchmarkRotateMemtables(b *testing.B) {
+	o := &Options{FS: vfs.NewMem(), MemTableSize: 64 << 20 /* 64 MB */}
+	d, err := Open("", o)
+	require.NoError(b, err)
+
+	// We want to jump to full-sized memtables.
+	d.mu.Lock()
+	d.mu.mem.nextSize = o.MemTableSize
+	d.mu.Unlock()
+	require.NoError(b, d.Flush())
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		if err := d.Flush(); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
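The totals the test asserts follow from one rule: every allocated
memtable, whether mutable, pinned by a read state, or stashed for
recycling, holds one reservation, and the recycle slot holds at most
one memtable. A toy model of the accounting; `model`, `alloc`, and
`release` are illustrative names, not the Pebble API.

```go
package main

import "fmt"

// model tracks memtable reservations the way TestMemTableReservation
// observes them.
type model struct {
	reserved int  // number of live memtable reservations
	slot     bool // is a memtable stashed for recycling?
}

// alloc models rotating in a new mutable memtable: reuse the stashed
// one if present, otherwise allocate (one more reservation).
func (m *model) alloc() {
	if m.slot {
		m.slot = false
		return
	}
	m.reserved++
}

// release models an obsolete memtable entering the recycle slot; a
// displaced occupant is freed (one fewer reservation).
func (m *model) release() {
	if m.slot {
		m.reserved--
	}
	m.slot = true
}

func main() {
	m := &model{reserved: 1} // one mutable memtable at Open

	// Flush #1: allocate a new mutable, recycle the old one.
	m.alloc()
	m.release()
	fmt.Println(m.reserved) // 2

	// Flush #2: the new mutable reuses the stashed memtable; the old
	// one takes its place in the slot. Unchanged.
	m.alloc()
	m.release()
	fmt.Println(m.reserved) // 2

	// Flush #3 with an iterator open: the old memtable stays pinned by
	// the iterator's read state, so it is not released into the slot.
	m.alloc()
	fmt.Println(m.reserved) // 2: 1 mutable + 1 pinned

	// Flush #4: the slot is empty, so a new memtable is allocated.
	m.alloc()
	m.release()
	fmt.Println(m.reserved) // 3: mutable + recycled + pinned

	// iter.Close(): the pinned memtable is released, displacing and
	// freeing the memtable already in the slot.
	m.release()
	fmt.Println(m.reserved) // 2
}
```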
diff --git a/mem_table.go b/mem_table.go
index 8136526714..195469d496 100644
--- a/mem_table.go
+++ b/mem_table.go
@@ -15,6 +15,7 @@ import (
 	"github.com/cockroachdb/pebble/internal/arenaskl"
 	"github.com/cockroachdb/pebble/internal/base"
 	"github.com/cockroachdb/pebble/internal/keyspan"
+	"github.com/cockroachdb/pebble/internal/manual"
 	"github.com/cockroachdb/pebble/internal/rangedel"
 	"github.com/cockroachdb/pebble/internal/rangekey"
 )
@@ -84,7 +85,16 @@ type memTable struct {
 	rangeKeys keySpanCache
 	// The current logSeqNum at the time the memtable was created. This is
 	// guaranteed to be less than or equal to any seqnum stored in the memtable.
-	logSeqNum uint64
+	logSeqNum                    uint64
+	releaseAccountingReservation func()
+}
+
+func (m *memTable) free() {
+	if m != nil {
+		m.releaseAccountingReservation()
+		manual.Free(m.arenaBuf)
+		m.arenaBuf = nil
+	}
 }

 // memTableOptions holds configuration used when creating a memTable. All of
@@ -92,9 +102,10 @@ type memTable struct {
 // which is used by tests.
 type memTableOptions struct {
 	*Options
-	arenaBuf  []byte
-	size      int
-	logSeqNum uint64
+	arenaBuf                     []byte
+	size                         int
+	logSeqNum                    uint64
+	releaseAccountingReservation func()
 }

 func checkMemTable(obj interface{}) {
@@ -109,16 +120,22 @@ func checkMemTable(obj interface{}) {
 // Options.MemTableSize is used instead.
 func newMemTable(opts memTableOptions) *memTable {
 	opts.Options = opts.Options.EnsureDefaults()
+	m := new(memTable)
+	m.init(opts)
+	return m
+}
+
+func (m *memTable) init(opts memTableOptions) {
 	if opts.size == 0 {
 		opts.size = opts.MemTableSize
 	}
-
-	m := &memTable{
-		cmp:       opts.Comparer.Compare,
-		formatKey: opts.Comparer.FormatKey,
-		equal:     opts.Comparer.Equal,
-		arenaBuf:  opts.arenaBuf,
-		logSeqNum: opts.logSeqNum,
+	*m = memTable{
+		cmp:                          opts.Comparer.Compare,
+		formatKey:                    opts.Comparer.FormatKey,
+		equal:                        opts.Comparer.Equal,
+		arenaBuf:                     opts.arenaBuf,
+		logSeqNum:                    opts.logSeqNum,
+		releaseAccountingReservation: opts.releaseAccountingReservation,
 	}
 	m.writerRefs.Store(1)
 	m.tombstones = keySpanCache{
@@ -143,7 +160,6 @@ func newMemTable(opts memTableOptions) *memTable {
 	m.rangeDelSkl.Reset(arena, m.cmp)
 	m.rangeKeySkl.Reset(arena, m.cmp)
 	m.reserved = arena.Size()
-	return m
 }

 func (m *memTable) writerRef() {
diff --git a/metrics.go b/metrics.go
index ff17eac2c7..9410b0e567 100644
--- a/metrics.go
+++ b/metrics.go
@@ -184,7 +184,10 @@ type Metrics struct {
 		// The count of memtables.
 		Count int64
 		// The number of bytes present in zombie memtables which are no longer
-		// referenced by the current DB state but are still in use by an iterator.
+		// referenced by the current DB state. An unbounded number of memtables
+		// may be zombie if they're still in use by an iterator. One additional
+		// memtable may be zombie if it's no longer in use and waiting to be
+		// recycled.
 		ZombieSize uint64
 		// The count of zombie memtables.
 		ZombieCount int64
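Splitting construction into `newMemTable` and `init` is what lets a
recycled memtable be rebuilt over its old arena: `init` assigns a whole
struct literal, so no state from the previous incarnation leaks into
the new one. A generic sketch of this reset-in-place idiom, with an
illustrative `table` type that is not part of Pebble.

```go
package main

import "fmt"

// table stands in for a type that owns a large reusable buffer.
type table struct {
	buf   []byte
	seq   uint64
	dirty bool
}

// init (re)builds t in place. Assigning a whole struct literal resets
// every field to its zero value unless explicitly set, so the caller
// threads the reusable buffer through; everything else starts fresh.
func (t *table) init(buf []byte, seq uint64) {
	*t = table{
		buf: buf,
		seq: seq,
	}
}

// newTable is the allocation path: a fresh object and a fresh buffer,
// mirroring newMemTable delegating to init.
func newTable(size int, seq uint64) *table {
	t := new(table)
	t.init(make([]byte, size), seq)
	return t
}

func main() {
	t := newTable(1<<20, 1)
	t.dirty = true
	// Recycle: keep the old buffer, rebuild the struct from scratch.
	t.init(t.buf, 2)
	fmt.Println(t.seq, t.dirty) // 2 false
}
```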
diff --git a/testdata/event_listener b/testdata/event_listener
index 37811d053a..fbff2a33b4 100644
--- a/testdata/event_listener
+++ b/testdata/event_listener
@@ -284,7 +284,7 @@ WAL: 1 files (27B) in: 48B written: 108B (125% overhead)
 Flushes: 3
 Compactions: 1 estimated debt: 2.0KB in progress: 0 (0B)
 default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
-MemTables: 1 (256KB) zombie: 0 (0B)
+MemTables: 1 (256KB) zombie: 1 (256KB)
 Zombie tables: 0 (0B)
 Block cache: 6 entries (1.1KB) hit rate: 11.1%
 Table cache: 1 entries (800B) hit rate: 40.0%
@@ -380,7 +380,7 @@ WAL: 1 files (29B) in: 82B written: 110B (34% overhead)
 Flushes: 6
 Compactions: 1 estimated debt: 4.0KB in progress: 0 (0B)
 default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
-MemTables: 1 (512KB) zombie: 0 (0B)
+MemTables: 1 (512KB) zombie: 1 (512KB)
 Zombie tables: 0 (0B)
 Block cache: 12 entries (2.3KB) hit rate: 14.3%
 Table cache: 1 entries (800B) hit rate: 50.0%
diff --git a/testdata/metrics b/testdata/metrics
index 95f796abb3..2b8d8090b2 100644
--- a/testdata/metrics
+++ b/testdata/metrics
@@ -147,7 +147,7 @@ WAL: 1 files (28B) in: 34B written: 84B (147% overhead)
 Flushes: 2
 Compactions: 1 estimated debt: 0B in progress: 0 (0B)
 default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
-MemTables: 1 (256KB) zombie: 1 (256KB)
+MemTables: 1 (256KB) zombie: 2 (512KB)
 Zombie tables: 2 (1.2KB)
 Block cache: 5 entries (1.0KB) hit rate: 42.9%
 Table cache: 2 entries (1.6KB) hit rate: 66.7%
@@ -180,7 +180,7 @@ WAL: 1 files (28B) in: 34B written: 84B (147% overhead)
 Flushes: 2
 Compactions: 1 estimated debt: 0B in progress: 0 (0B)
 default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
-MemTables: 1 (256KB) zombie: 1 (256KB)
+MemTables: 1 (256KB) zombie: 2 (512KB)
 Zombie tables: 1 (633B)
 Block cache: 3 entries (528B) hit rate: 42.9%
 Table cache: 1 entries (800B) hit rate: 66.7%
@@ -216,7 +216,7 @@ WAL: 1 files (28B) in: 34B written: 84B (147% overhead)
 Flushes: 2
 Compactions: 1 estimated debt: 0B in progress: 0 (0B)
 default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
-MemTables: 1 (256KB) zombie: 0 (0B)
+MemTables: 1 (256KB) zombie: 1 (256KB)
 Zombie tables: 0 (0B)
 Block cache: 0 entries (0B) hit rate: 42.9%
 Table cache: 0 entries (0B) hit rate: 66.7%
@@ -278,7 +278,7 @@ WAL: 1 files (93B) in: 116B written: 242B (109% overhead)
 Flushes: 3
 Compactions: 1 estimated debt: 2.8KB in progress: 0 (0B)
 default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
-MemTables: 1 (256KB) zombie: 0 (0B)
+MemTables: 1 (256KB) zombie: 1 (256KB)
 Zombie tables: 0 (0B)
 Block cache: 0 entries (0B) hit rate: 42.9%
 Table cache: 0 entries (0B) hit rate: 66.7%
@@ -324,7 +324,7 @@ WAL: 1 files (93B) in: 116B written: 242B (109% overhead)
 Flushes: 3
 Compactions: 2 estimated debt: 0B in progress: 0 (0B)
 default: 2 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
-MemTables: 1 (256KB) zombie: 0 (0B)
+MemTables: 1 (256KB) zombie: 1 (256KB)
 Zombie tables: 0 (0B)
 Block cache: 0 entries (0B) hit rate: 27.3%
 Table cache: 0 entries (0B) hit rate: 58.3%
@@ -416,7 +416,7 @@ WAL: 1 files (26B) in: 176B written: 175B (-1% overhead)
 Flushes: 8
 Compactions: 2 estimated debt: 4.8KB in progress: 0 (0B)
 default: 2 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
-MemTables: 1 (1.0MB) zombie: 0 (0B)
+MemTables: 1 (1.0MB) zombie: 1 (1.0MB)
 Zombie tables: 0 (0B)
 Block cache: 12 entries (2.3KB) hit rate: 31.1%
 Table cache: 3 entries (2.3KB) hit rate: 57.9%