diff --git a/db.go b/db.go
index d95f4e68f7..0d42c7e1f4 100644
--- a/db.go
+++ b/db.go
@@ -253,9 +253,14 @@ func (d defaultCPUWorkGranter) CPUWorkDone(_ CPUWorkHandle) {}
 type DB struct {
 	// The count and size of referenced memtables. This includes memtables
 	// present in DB.mu.mem.queue, as well as memtables that have been flushed
-	// but are still referenced by an inuse readState.
+	// but are still referenced by an in-use readState, and up to one memtable
+	// waiting to be reused, stored in d.memTableRecycle.
 	memTableCount    atomic.Int64
 	memTableReserved atomic.Int64 // number of bytes reserved in the cache for memtables
+	// memTableRecycle holds a pointer to an obsolete memtable. The next
+	// memtable allocation will reuse this memtable if it has not already been
+	// recycled.
+	memTableRecycle atomic.Pointer[memTable]

 	// The size of the current log file (i.e. db.mu.log.queue[len(queue)-1].
 	logSize atomic.Uint64
@@ -1540,6 +1545,10 @@ func (d *DB) Close() error {
 			// replay.
 			mem.readerUnrefLocked(false)
 		}
+		// If there's an unused, recycled memtable, we need to release its memory.
+		if obsoleteMemTable := d.memTableRecycle.Swap(nil); obsoleteMemTable != nil {
+			d.freeMemTable(obsoleteMemTable)
+		}
 		if reserved := d.memTableReserved.Load(); reserved != 0 {
 			err = firstError(err, errors.Errorf("leaked memtable reservation: %d", errors.Safe(reserved)))
 		}
@@ -2175,30 +2184,68 @@ func (d *DB) newMemTable(logNum FileNum, logSeqNum uint64) (*memTable, *flushabl
 		}
 	}

-	d.memTableCount.Add(1)
-	d.memTableReserved.Add(int64(size))
-	releaseAccountingReservation := d.opts.Cache.Reserve(size)
-
-	mem := newMemTable(memTableOptions{
+	memtblOpts := memTableOptions{
 		Options:   d.opts,
-		arenaBuf:  manual.New(int(size)),
 		logSeqNum: logSeqNum,
-	})
+	}

-	// Note: this is a no-op if invariants are disabled or race is enabled.
-	invariants.SetFinalizer(mem, checkMemTable)
+	// Before attempting to allocate a new memtable, check if there's one
+	// available for recycling in memTableRecycle. Large contiguous allocations
+	// can be costly, as fragmentation makes it more difficult to find a large
+	// contiguous free space. We've observed 64MB allocations taking 10ms+.
+	//
+	// To reduce these costly allocations, up to 1 obsolete memtable is stashed
+	// in `d.memTableRecycle` to allow a future memtable rotation to reuse
+	// existing memory.
+	var mem *memTable
+	mem = d.memTableRecycle.Swap(nil)
+	if mem != nil && len(mem.arenaBuf) != size {
+		d.freeMemTable(mem)
+		mem = nil
+	}
+	if mem != nil {
+		// Carry through the existing buffer and memory reservation.
+		memtblOpts.arenaBuf = mem.arenaBuf
+		memtblOpts.releaseAccountingReservation = mem.releaseAccountingReservation
+	} else {
+		mem = new(memTable)
+		memtblOpts.arenaBuf = manual.New(int(size))
+		memtblOpts.releaseAccountingReservation = d.opts.Cache.Reserve(size)
+		d.memTableCount.Add(1)
+		d.memTableReserved.Add(int64(size))
+
+		// Note: this is a no-op if invariants are disabled or race is enabled.
+		invariants.SetFinalizer(mem, checkMemTable)
+	}
+	mem.init(memtblOpts)

 	entry := d.newFlushableEntry(mem, logNum, logSeqNum)
 	entry.releaseMemAccounting = func() {
-		manual.Free(mem.arenaBuf)
-		mem.arenaBuf = nil
-		d.memTableCount.Add(-1)
-		d.memTableReserved.Add(-int64(size))
-		releaseAccountingReservation()
+		// If the user leaks iterators, we may be releasing the memtable after
+		// the DB is already closed. In this case, we want to just release the
+		// memory because DB.Close won't come along to free it for us.
+		if err := d.closed.Load(); err != nil {
+			d.freeMemTable(mem)
+			return
+		}
+
+		// The next memtable allocation might be able to reuse this memtable.
+		// Stash it on d.memTableRecycle.
+		if unusedMem := d.memTableRecycle.Swap(mem); unusedMem != nil {
+			// There was already a memtable waiting to be recycled. We're now
+			// responsible for freeing it.
+			d.freeMemTable(unusedMem)
+		}
 	}
 	return mem, entry
 }

+func (d *DB) freeMemTable(m *memTable) {
+	d.memTableCount.Add(-1)
+	d.memTableReserved.Add(-int64(len(m.arenaBuf)))
+	m.free()
+}
+
 func (d *DB) newFlushableEntry(f flushable, logNum FileNum, logSeqNum uint64) *flushableEntry {
 	fe := &flushableEntry{
 		flushable: f,
diff --git a/db_test.go b/db_test.go
index 3ff2d0bafb..c375b797a2 100644
--- a/db_test.go
+++ b/db_test.go
@@ -827,20 +827,34 @@ func TestMemTableReservation(t *testing.T) {
 		t.Fatalf("expected failure, but found success: %s", h.Get())
 	}

-	// Flush the memtable. The memtable reservation should be unchanged because a
-	// new memtable will be allocated.
+	// Flush the memtable. The memtable reservation should double because the
+	// old memtable will be recycled, saved for the next memtable allocation.
 	require.NoError(t, d.Flush())
-	checkReserved(int64(opts.MemTableSize))
+	checkReserved(int64(2 * opts.MemTableSize))
+	// Flush again. The memtable reservation should be unchanged because at most
+	// 1 memtable may be preserved for recycling.

 	// Flush in the presence of an active iterator. The iterator will hold a
 	// reference to a readState which will in turn hold a reader reference to the
-	// memtable. While the iterator is open, there will be the memory for 2
-	// memtables reserved.
+	// memtable.
 	iter := d.NewIter(nil)
 	require.NoError(t, d.Flush())
+	// The flush moved the recycled memtable into position as an active mutable
+	// memtable. There are now two allocated memtables: 1 mutable and 1 pinned
+	// by the iterator's read state.
 	checkReserved(2 * int64(opts.MemTableSize))
+
+	// Flushing again should increase the reservation total to 3x: 1 active
+	// mutable, 1 for recycling, and 1 pinned by the iterator's read state.
+	require.NoError(t, d.Flush())
+	checkReserved(3 * int64(opts.MemTableSize))
+
+	// Closing the iterator will release the iterator's read state, and the old
+	// memtable will be moved into position as the next memtable to recycle.
+	// There was already a memtable ready to be recycled, so that memtable will
+	// be freed and the overall reservation total is reduced to 2x.
 	require.NoError(t, iter.Close())
-	checkReserved(int64(opts.MemTableSize))
+	checkReserved(2 * int64(opts.MemTableSize))

 	require.NoError(t, d.Close())
 }
@@ -1935,3 +1949,22 @@ func verifyGetNotFound(t *testing.T, r Reader, key []byte) {
 		t.Fatalf("expected nil, but got %s", val)
 	}
 }
+
+func BenchmarkRotateMemtables(b *testing.B) {
+	o := &Options{FS: vfs.NewMem(), MemTableSize: 64 << 20 /* 64 MB */}
+	d, err := Open("", o)
+	require.NoError(b, err)
+
+	// We want to jump to full-sized memtables.
+ d.mu.Lock() + d.mu.mem.nextSize = o.MemTableSize + d.mu.Unlock() + require.NoError(b, d.Flush()) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + if err := d.Flush(); err != nil { + b.Fatal(err) + } + } +} diff --git a/mem_table.go b/mem_table.go index 8136526714..195469d496 100644 --- a/mem_table.go +++ b/mem_table.go @@ -15,6 +15,7 @@ import ( "github.com/cockroachdb/pebble/internal/arenaskl" "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/internal/manual" "github.com/cockroachdb/pebble/internal/rangedel" "github.com/cockroachdb/pebble/internal/rangekey" ) @@ -84,7 +85,16 @@ type memTable struct { rangeKeys keySpanCache // The current logSeqNum at the time the memtable was created. This is // guaranteed to be less than or equal to any seqnum stored in the memtable. - logSeqNum uint64 + logSeqNum uint64 + releaseAccountingReservation func() +} + +func (m *memTable) free() { + if m != nil { + m.releaseAccountingReservation() + manual.Free(m.arenaBuf) + m.arenaBuf = nil + } } // memTableOptions holds configuration used when creating a memTable. All of @@ -92,9 +102,10 @@ type memTable struct { // which is used by tests. type memTableOptions struct { *Options - arenaBuf []byte - size int - logSeqNum uint64 + arenaBuf []byte + size int + logSeqNum uint64 + releaseAccountingReservation func() } func checkMemTable(obj interface{}) { @@ -109,16 +120,22 @@ func checkMemTable(obj interface{}) { // Options.MemTableSize is used instead. func newMemTable(opts memTableOptions) *memTable { opts.Options = opts.Options.EnsureDefaults() + m := new(memTable) + m.init(opts) + return m +} + +func (m *memTable) init(opts memTableOptions) { if opts.size == 0 { opts.size = opts.MemTableSize } - - m := &memTable{ - cmp: opts.Comparer.Compare, - formatKey: opts.Comparer.FormatKey, - equal: opts.Comparer.Equal, - arenaBuf: opts.arenaBuf, - logSeqNum: opts.logSeqNum, + *m = memTable{ + cmp: opts.Comparer.Compare, + formatKey: opts.Comparer.FormatKey, + equal: opts.Comparer.Equal, + arenaBuf: opts.arenaBuf, + logSeqNum: opts.logSeqNum, + releaseAccountingReservation: opts.releaseAccountingReservation, } m.writerRefs.Store(1) m.tombstones = keySpanCache{ @@ -143,7 +160,6 @@ func newMemTable(opts memTableOptions) *memTable { m.rangeDelSkl.Reset(arena, m.cmp) m.rangeKeySkl.Reset(arena, m.cmp) m.reserved = arena.Size() - return m } func (m *memTable) writerRef() { diff --git a/metrics.go b/metrics.go index ff17eac2c7..9410b0e567 100644 --- a/metrics.go +++ b/metrics.go @@ -184,7 +184,10 @@ type Metrics struct { // The count of memtables. Count int64 // The number of bytes present in zombie memtables which are no longer - // referenced by the current DB state but are still in use by an iterator. + // referenced by the current DB state. An unbounded number of memtables + // may be zombie if they're still in use by an iterator. One additional + // memtable may be zombie if it's no longer in use and waiting to be + // recycled. ZombieSize uint64 // The count of zombie memtables. 
	ZombieCount int64
diff --git a/testdata/event_listener b/testdata/event_listener
index 37811d053a..fbff2a33b4 100644
--- a/testdata/event_listener
+++ b/testdata/event_listener
@@ -284,7 +284,7 @@ WAL: 1 files (27B) in: 48B written: 108B (125% overhead)
 Flushes: 3
 Compactions: 1 estimated debt: 2.0KB in progress: 0 (0B)
 default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
-MemTables: 1 (256KB) zombie: 0 (0B)
+MemTables: 1 (256KB) zombie: 1 (256KB)
 Zombie tables: 0 (0B)
 Block cache: 6 entries (1.1KB) hit rate: 11.1%
 Table cache: 1 entries (800B) hit rate: 40.0%
@@ -380,7 +380,7 @@ WAL: 1 files (29B) in: 82B written: 110B (34% overhead)
 Flushes: 6
 Compactions: 1 estimated debt: 4.0KB in progress: 0 (0B)
 default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
-MemTables: 1 (512KB) zombie: 0 (0B)
+MemTables: 1 (512KB) zombie: 1 (512KB)
 Zombie tables: 0 (0B)
 Block cache: 12 entries (2.3KB) hit rate: 14.3%
 Table cache: 1 entries (800B) hit rate: 50.0%
diff --git a/testdata/metrics b/testdata/metrics
index 95f796abb3..2b8d8090b2 100644
--- a/testdata/metrics
+++ b/testdata/metrics
@@ -147,7 +147,7 @@ WAL: 1 files (28B) in: 34B written: 84B (147% overhead)
 Flushes: 2
 Compactions: 1 estimated debt: 0B in progress: 0 (0B)
 default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
-MemTables: 1 (256KB) zombie: 1 (256KB)
+MemTables: 1 (256KB) zombie: 2 (512KB)
 Zombie tables: 2 (1.2KB)
 Block cache: 5 entries (1.0KB) hit rate: 42.9%
 Table cache: 2 entries (1.6KB) hit rate: 66.7%
@@ -180,7 +180,7 @@ WAL: 1 files (28B) in: 34B written: 84B (147% overhead)
 Flushes: 2
 Compactions: 1 estimated debt: 0B in progress: 0 (0B)
 default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
-MemTables: 1 (256KB) zombie: 1 (256KB)
+MemTables: 1 (256KB) zombie: 2 (512KB)
 Zombie tables: 1 (633B)
 Block cache: 3 entries (528B) hit rate: 42.9%
 Table cache: 1 entries (800B) hit rate: 66.7%
@@ -216,7 +216,7 @@ WAL: 1 files (28B) in: 34B written: 84B (147% overhead)
 Flushes: 2
 Compactions: 1 estimated debt: 0B in progress: 0 (0B)
 default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
-MemTables: 1 (256KB) zombie: 0 (0B)
+MemTables: 1 (256KB) zombie: 1 (256KB)
 Zombie tables: 0 (0B)
 Block cache: 0 entries (0B) hit rate: 42.9%
 Table cache: 0 entries (0B) hit rate: 66.7%
@@ -278,7 +278,7 @@ WAL: 1 files (93B) in: 116B written: 242B (109% overhead)
 Flushes: 3
 Compactions: 1 estimated debt: 2.8KB in progress: 0 (0B)
 default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
-MemTables: 1 (256KB) zombie: 0 (0B)
+MemTables: 1 (256KB) zombie: 1 (256KB)
 Zombie tables: 0 (0B)
 Block cache: 0 entries (0B) hit rate: 42.9%
 Table cache: 0 entries (0B) hit rate: 66.7%
@@ -324,7 +324,7 @@ WAL: 1 files (93B) in: 116B written: 242B (109% overhead)
 Flushes: 3
 Compactions: 2 estimated debt: 0B in progress: 0 (0B)
 default: 2 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
-MemTables: 1 (256KB) zombie: 0 (0B)
+MemTables: 1 (256KB) zombie: 1 (256KB)
 Zombie tables: 0 (0B)
 Block cache: 0 entries (0B) hit rate: 27.3%
 Table cache: 0 entries (0B) hit rate: 58.3%
@@ -416,7 +416,7 @@ WAL: 1 files (26B) in: 176B written: 175B (-1% overhead)
 Flushes: 8
 Compactions: 2 estimated debt: 4.8KB in progress: 0 (0B)
 default: 2 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 multi-level: 0
-MemTables: 1 (1.0MB) zombie: 0 (0B)
+MemTables: 1 (1.0MB) zombie: 1 (1.0MB)
 Zombie tables: 0 (0B)
 Block cache: 12 entries (2.3KB) hit rate: 31.1%
 Table cache: 3 entries (2.3KB) hit rate: 57.9%
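
The core pattern in the db.go hunk above is a lock-free, single-slot free list built on atomic.Pointer.Swap: both the producer (a retiring memtable) and the consumer (the next memtable rotation) swap the slot, so exactly one party owns any given allocation at any time and is responsible for freeing it, including on the DB.Close and wrong-size paths. The standalone sketch below distills that pattern; the recycler, buffer, get, and put names are hypothetical illustrations, not part of Pebble's API.

package main

import (
	"fmt"
	"sync/atomic"
)

// buffer stands in for a memtable's backing allocation.
type buffer struct {
	buf []byte
}

// recycler keeps at most one retired buffer. Swap transfers ownership
// atomically, so no mutex is needed and at most one allocation sits idle.
type recycler struct {
	slot atomic.Pointer[buffer]
}

// get returns the stashed buffer if it matches the wanted size; a stashed
// buffer of the wrong size is dropped and fresh memory is allocated.
func (r *recycler) get(size int) *buffer {
	if b := r.slot.Swap(nil); b != nil {
		if len(b.buf) == size {
			return b // reuse the previous allocation
		}
		// Wrong size: with manually managed memory this is where the old
		// arena would be freed (the d.freeMemTable path in the diff).
	}
	return &buffer{buf: make([]byte, size)}
}

// put stashes a retired buffer for the next get. If another buffer was
// already waiting, the caller becomes its owner and must free it.
func (r *recycler) put(b *buffer) {
	if old := r.slot.Swap(b); old != nil {
		_ = old // free the displaced buffer here
	}
}

func main() {
	var r recycler
	a := r.get(64 << 20)
	r.put(a)
	b := r.get(64 << 20)
	fmt.Println(&a.buf[0] == &b.buf[0]) // true: the allocation was reused
}

Capping the free list at a single slot bounds idle memory to one memtable while keeping the rotation path lock-free, which is exactly the accounting the updated TestMemTableReservation asserts: 2x after a flush (one mutable plus one recycled), never more than one spare.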
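The diff also moves the invariants.SetFinalizer(mem, checkMemTable) call so it runs only for freshly allocated memtables; a recycled memtable keeps the finalizer from its original allocation. checkMemTable's body is not shown in this diff, so the sketch below is only an illustration (with hypothetical names) of the general finalizer-based leak check this enables in invariants builds: if a table is garbage collected while its arena is still attached, someone failed to free it.

package main

import (
	"fmt"
	"runtime"
)

// table stands in for a memtable; arenaBuf is non-nil while the manually
// managed arena is still allocated.
type table struct {
	arenaBuf []byte
}

// checkTable runs as a finalizer when the GC collects a table. Being
// collected with a live arena means the arena can never be freed: a leak.
func checkTable(obj interface{}) {
	t := obj.(*table)
	if t.arenaBuf != nil {
		panic("table garbage collected, but arenaBuf was never freed")
	}
}

func newTable(size int) *table {
	t := &table{arenaBuf: make([]byte, size)}
	runtime.SetFinalizer(t, checkTable)
	return t
}

func main() {
	t := newTable(1 << 20)
	t.arenaBuf = nil // release the arena before dropping the last reference
	t = nil
	runtime.GC()
	fmt.Println("no leak detected")
}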