Discard invalid versions of keys during compaction #464
This is what it looks like running with FileIO for TableLoadingMode.
More craziness. Test case: start up a writer, write TTL'd keys for 5 minutes, shut the writer off, and watch for compaction to clean everything up and purge.
Expected result:
Actual result:
It looks like as SSTs grow, the bloom filters take up more and more space, because they're always loaded into memory. We can probably use an LRU cache for blooms to avoid their memory usage growing forever. I'll run your code over the weekend and check whether I get the OOM, and what to do about it. Ping me if you don't hear back by Tuesday.
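For illustration only, a bounded bloom-filter cache along those lines might look roughly like the toy sketch below. This is not Badger's actual code; the `tableID` key, the `load` callback, and the capacity are all invented for the example.

```go
package main

import (
	"container/list"
	"fmt"
)

type cacheEntry struct {
	tableID uint64
	filter  []byte // serialized bloom filter; would be parsed before use
}

// bloomCache keeps at most cap filters in memory, evicting the least
// recently used one when a new filter has to be loaded.
type bloomCache struct {
	cap   int
	ll    *list.List               // front = most recently used
	items map[uint64]*list.Element // table ID -> list element
}

func newBloomCache(capacity int) *bloomCache {
	return &bloomCache{
		cap:   capacity,
		ll:    list.New(),
		items: make(map[uint64]*list.Element),
	}
}

// get returns the cached filter for a table, loading it on a miss (e.g. by
// reading it back from the .sst file) and evicting the LRU entry if needed.
func (c *bloomCache) get(tableID uint64, load func(uint64) []byte) []byte {
	if el, ok := c.items[tableID]; ok {
		c.ll.MoveToFront(el)
		return el.Value.(*cacheEntry).filter
	}
	f := load(tableID)
	c.items[tableID] = c.ll.PushFront(&cacheEntry{tableID: tableID, filter: f})
	if c.ll.Len() > c.cap {
		oldest := c.ll.Back()
		c.ll.Remove(oldest)
		delete(c.items, oldest.Value.(*cacheEntry).tableID)
	}
	return f
}

func main() {
	cache := newBloomCache(2)
	loads := 0
	load := func(id uint64) []byte {
		loads++
		return []byte{byte(id)} // stand-in for reading the filter from disk
	}
	for _, id := range []uint64{1, 2, 1, 3, 1} {
		cache.get(id, load)
	}
	fmt.Println("filters loaded from disk:", loads) // 3: table 2 was evicted, table 1 stayed hot
}
```

With a cap tied to available memory, filter usage stays bounded no matter how many SSTs accumulate, at the cost of an occasional re-read on a cache miss.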
Here is a complete working example with a main() func:
package main
import (
"github.com/dgraph-io/badger"
"github.com/dgraph-io/badger/options"
"time"
"fmt"
"encoding/binary"
"path/filepath"
"os"
"github.com/Sirupsen/logrus"
"encoding/json"
_ "net/http/pprof"
"net/http"
)
type writable struct {
key []byte
value []byte
}
var (
// store the first key we've seen so we can try and query it to see when it's expired from view
firstKey []byte
firstKeyStored bool //
stopWriting = false
)
func main() {
go func() {
// you can hit:
// go tool pprof http://localhost:8001/debug/pprof/heap
// go tool pprof http://localhost:8001/debug/pprof/profile
logrus.Infof("starting debug web server....")
logrus.Info(http.ListenAndServe("localhost:8001", nil))
}()
done := make(chan struct{})
bt := NewBadgerTest()
bt.Start()
<-done // wait
}
type BadgerTest struct {
db *badger.DB
}
func NewBadgerTest() *BadgerTest {
dir := "/raid0/badgertest"
opts := badger.DefaultOptions
opts.Dir = dir
opts.ValueDir = dir
opts.ValueLogLoadingMode = options.FileIO
opts.TableLoadingMode = options.FileIO
opts.NumCompactors = 20
opts.MaxTableSize = .25 * 1073741824 // .25GB
opts.NumMemtables = 2
opts.ValueLogFileSize = 1 * 1073741824 // 1GB
opts.SyncWrites = false
bytes, _ := json.Marshal(&opts)
logrus.Infof("BADGER OPTIONS=%s", string(bytes))
db, err := badger.Open(opts)
if err != nil {
panic(fmt.Sprintf("unable to open badger db; %s", err))
}
bt := &BadgerTest{
db: db,
}
go bt.filecounts(dir)
return bt
}
func (b *BadgerTest) Start() {
workers := 1
for i := 0; i < workers; i++ {
go b.write()
}
//go b.write()
go b.badgerGC()
go func() {
tick := time.NewTicker(1 * time.Minute)
for _ = range tick.C {
b.readKey()
}
}()
//go func() {
// tick := time.NewTicker(15 * time.Minute)
// for _ = range tick.C {
// stopWriting = true
// return
// }
//}()
}
func (b *BadgerTest) Stop() {
b.db.Close()
logrus.Infof("%s shut down badger test", time.Now())
time.Sleep(1 * time.Second)
}
func (b *BadgerTest) filecounts(dir string) {
ticker := time.NewTicker(60 * time.Second)
for _ = range ticker.C {
logFiles := int64(0)
sstFiles := int64(0)
_ = filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
if filepath.Ext(path) == ".sst" {
sstFiles++
}
if filepath.Ext(path) == ".vlog" {
logFiles++
}
return nil
})
logrus.Infof("%s updated gauges vlog=%d sst=%d", time.Now(), logFiles, sstFiles)
}
}
func (b *BadgerTest) write() {
data := `{"randomstring":"6446D58D6DFAFD58586D3EA85A53F4A6B3CC057F933A22BB58E188A74AC8F663","refID":12495494,"testfield1234":"foo bar baz","date":"2018-01-01 12:00:00"}`
batchSize := 1000
rows := []writable{}
var cnt uint64
for {
if stopWriting {
logrus.Infof("%s stopping writing, let it catch up", time.Now())
return
}
cnt++
ts := time.Now().UnixNano()
buf := make([]byte, 16)
offset := 0
binary.BigEndian.PutUint64(buf[offset:], uint64(ts))
offset = offset + 8
binary.BigEndian.PutUint64(buf[offset:], cnt)
if !firstKeyStored {
firstKey = buf
firstKeyStored = true
logrus.Infof("%s firstkey stored %x", time.Now(), firstKey)
}
w := writable{key: buf, value: []byte(data)}
rows = append(rows, w)
if len(rows) > batchSize {
b.saveRows(rows)
rows = []writable{}
}
}
}
func (b *BadgerTest) saveRows(rows []writable) {
ttl := 30 * time.Minute
txn := b.db.NewTransaction(true)
for _, row := range rows {
if err := txn.SetWithTTL(row.key, row.value, ttl); err == badger.ErrTxnTooBig {
logrus.Infof("%s TX too big, committing...", time.Now())
_ = txn.Commit(nil)
txn = b.db.NewTransaction(true)
// retry the write in the fresh transaction and check the error this time
if err = txn.SetWithTTL(row.key, row.value, ttl); err != nil {
logrus.Errorf("unable to set key-val in fresh transaction; %s", err)
}
} else if err != nil {
logrus.Errorf("unable to set key-val in transaction; %s", err)
}
}
err := txn.Commit(nil)
if err != nil {
logrus.Errorf("unable to commit transaction; %s", err)
}
}
func (b *BadgerTest) readKey() {
// at some point our first key should be expired
err := b.db.View(func(txn *badger.Txn) error {
item, err := txn.Get([]byte(firstKey))
if err != nil {
return err
}
val, err := item.Value()
if err != nil {
return err
}
logrus.Infof("%s FIRSTKEY VALUE=%s", time.Now(), val)
return nil
})
if err != nil {
logrus.Errorf("%s FIRSTKEY unable to read first key from badger; %s", time.Now(), err)
}
}
func (b *BadgerTest) badgerGC() {
_, lastVlogSize := b.db.Size()
logrus.Infof("lastVLOGSize=%v", lastVlogSize)
ticker := time.NewTicker(2 * time.Minute)
const GB = int64(1 << 30)
for {
select {
case <-ticker.C:
_, currentVlogSize := b.db.Size()
if currentVlogSize < lastVlogSize+GB {
continue
}
logrus.Infof("%s CLEANUP starting to purge keys", time.Now())
err := b.db.PurgeOlderVersions()
if err != nil {
logrus.Errorf("%s badgerOps unable to purge older versions; %s", time.Now(), err)
}
logrus.Infof("%s CLEANUP PurgeOlderVersions completed", time.Now())
// If size increased by 3.5 GB, then we run this 3 times.
numTimes := (currentVlogSize - lastVlogSize) / GB
for i := 0; i < int(numTimes); i++ {
err := b.db.RunValueLogGC(0.5)
if err != nil {
logrus.Errorf("%s badgerOps unable to RunValueLogGC; %s", time.Now(), err)
}
logrus.Infof("%s CLEANUP RunValueLogGC completed iteration=%d", time.Now(), i)
}
_, lastVlogSize = b.db.Size()
}
}
}
I added a stop-write boolean to see if data would eventually get cleaned up. From that testing, it appears vlog files will eventually be cleaned up from expired keys, but SST tables stay forever. It also appears that under heavy write load, vlog cleanup never catches up; not sure if there's room for more concurrency there. The key for me is finding a way to sustain consistent, heavy write loads 24/7 while maintaining a stable memory profile.
Other big observation: MaxTableSize in the options acts funny. If I set it to 2GB, what I'd expect is that after a memtable flush a smallish level-0 file gets written (maybe 50MB); after 10 of those files, they get compacted into larger files, maybe 200MB, and at level 2, 500MB, and so on, i.e. each level has larger and larger SST files. What I actually observe is that no matter what I set the value to, Badger writes files of about 70% of MaxTableSize at every level. If I set it to 1GB, it writes ~700MB SST files at level 0.
Keys aren't really deleted from the LSM tree; they're only tombstoned (marked as deleted). I'll see if we can have a way for the keys to be removed too, but that'd be beyond the scope of this issue.
Hrm. That sounds like a pretty big problem? Deletes should write tombstones, but compaction should eventually physically remove those items from disk; otherwise you can't manage a growing dataset without sharding Badger DBs. Should I file a separate issue?
Deleting a key in an LSM tree isn't a common thing, precisely because of compactions. Say you're compacting a key from L1 to L2, a key which was deleted, so you just remove it altogether. An older version of that key might still be present in L4, so all reads would now fall back to that version, bringing the deleted key back. Therefore, in LSM trees newer versions of keys are tombstoned instead of removed. However, if there's an alternative approach used by other LSM trees (say, RocksDB), I'm happy to take a look.
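To make the resurrection argument concrete, here is a toy model (plain Go maps standing in for levels, not Badger's code) showing why dropping a tombstone at an intermediate level brings an older version back:

```go
// Toy model of why LSM trees tombstone instead of physically deleting during
// an intermediate-level compaction: if the deletion marker is dropped while
// an older version still lives in a deeper level, reads resurrect the value.
package main

import "fmt"

const tombstone = "<deleted>"

// levels[0] is newest; a read returns the first hit walking down the levels.
func get(levels []map[string]string, key string) (string, bool) {
	for _, lvl := range levels {
		if v, ok := lvl[key]; ok {
			if v == tombstone {
				return "", false
			}
			return v, true
		}
	}
	return "", false
}

func main() {
	l1 := map[string]string{"k": tombstone} // newer level: k was deleted
	l4 := map[string]string{"k": "old"}     // older version, deeper level
	levels := []map[string]string{l1, l4}

	if _, ok := get(levels, "k"); !ok {
		fmt.Println("before compaction: k correctly reads as deleted")
	}

	// "Compact" L1 by removing the key outright instead of keeping the tombstone:
	delete(l1, "k")

	if v, ok := get(levels, "k"); ok {
		fmt.Printf("after dropping the tombstone: k resurrects as %q\n", v)
	}
}
```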
Quick reference: you have the bloom filters that let you know whether a key is present anywhere else. I'll dig through RocksDB tomorrow, but I haven't seen an LSM that doesn't remove keys at some point; otherwise you just fill the disk.
https://github.com/google/leveldb/blob/master/doc/impl.md
http://cidrdb.org/cidr2017/papers/p82-dong-cidr17.pdf (RocksDB overview)
This is what we're doing as well.
I tried implementing this; however, the multi-version nature of Badger keys makes this hard (probably also why RocksDB does not remove keys either). I worked on logic to remove deleted or expired keys with zero overlap from lower levels: https://github.com/dgraph-io/badger/tree/mrjn/removekeys

However, this still has edge cases where a stale version of a key could resurrect after the removal of a tombstone. Consider level L, the lowest level in the LSM tree (no overlap from levels below). Consider a key K with many versions, spanning two or more SSTables, T1 and T2. Say T1 contains the latest versions of K, including the tombstone marker; this marker makes all the older versions discardable. Now, say T1 gets compacted to L+1 first. We remove the tombstoned K and all the versions present in T1, so T1 moves to L+1 with no version of K. However, when a Get(K) runs, in the absence of any tombstone marker, it would read the latest version of K still present in T2 and serve that, which would be wrong. Compaction runs only one table at a time; we can't go across tables at the same level to ensure that all versions of K are picked up together.

Instead, what you could do is take a backup of Badger and then reimport it. Or we could have a tool that only iterates over the LSM tree and rebuilds it, without touching the value log -- this would be an offline operation, would reclaim LSM space, and would be very fast.

Update: Another approach might be to ensure that a single key never spans multiple tables at the same level. That might fix this problem. I'll consider it further.
I've implemented the above-mentioned update and added the ability to discard all older versions starting from the first deleted or expired version of the key. Can you try the https://github.com/dgraph-io/badger/tree/mrjn/removekeys branch?

Update: I ran it with your code (plus modifications to limit the number of keys to 10), and the LSM tree is working pretty nicely. After 2.75 hours and 100GB of data, this is the info log:
Awesome, I'll give it a try today. I'd say your point about no key spanning multiple files at the same level is a good move, and it's what other LSM implementations do as well (re: http://www.benstopford.com/2015/02/14/log-structured-merge-trees/, section "Levelled Compaction"). I also came across a good post from Influx that describes the situation and what they've tried. TL;DR: for DBs like Badger, shard the actual DB by time; for RocksDB, use column families with time blocks and delete the column family after a period of time has passed, which efficiently removes the data. https://docs.influxdata.com/influxdb/v0.9/concepts/storage_engine/
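For reference, a rough sketch of what time-sharding Badger could look like, using the same v1.x-era API as the test code in this thread. The base path, the one-hour shard window, and the 24-hour retention are arbitrary choices for illustration, not something prescribed here.

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"time"

	"github.com/dgraph-io/badger"
)

// shardDir returns the directory for the shard covering time t.
// One shard per hour is an arbitrary choice for the example.
func shardDir(base string, t time.Time) string {
	return filepath.Join(base, t.UTC().Format("2006010215"))
}

func openShard(dir string) (*badger.DB, error) {
	if err := os.MkdirAll(dir, 0755); err != nil {
		return nil, err
	}
	opts := badger.DefaultOptions
	opts.Dir = dir
	opts.ValueDir = dir
	return badger.Open(opts)
}

func main() {
	base := "/raid0/badgershards" // hypothetical base path
	// Writes go to the shard for the current window.
	db, err := openShard(shardDir(base, time.Now()))
	if err != nil {
		panic(err)
	}
	defer db.Close()

	// Dropping data older than the retention period is just deleting whole
	// shard directories: no tombstones and no compaction required.
	retention := 24 * time.Hour
	cutoff := shardDir(base, time.Now().Add(-retention))
	shards, _ := filepath.Glob(filepath.Join(base, "*"))
	for _, s := range shards {
		if s < cutoff { // fixed-width names make lexicographic order = time order
			fmt.Println("dropping expired shard", s)
			_ = os.RemoveAll(s)
		}
	}
}
```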
Tested with that branch. It looks like nothing got cleaned up in my case. Memory did drop, though.
Test:
Result:
Complete test code:
package main
import (
"github.com/dgraph-io/badger"
"github.com/dgraph-io/badger/options"
"time"
"fmt"
"encoding/binary"
"path/filepath"
"os"
"github.com/Sirupsen/logrus"
"encoding/json"
_ "net/http/pprof"
"net/http"
)
type writable struct {
key []byte
value []byte
}
var (
// store the first key we've seen so we can try and query it to see when it's expired from view
firstKey []byte
firstKeyStored bool //
stopWriting = false
)
func main() {
go func() {
// you can hit:
// go tool pprof http://localhost:8001/debug/pprof/heap
// go tool pprof http://localhost:8001/debug/pprof/profile
logrus.Infof("starting debug web server....")
logrus.Info(http.ListenAndServe("localhost:8001", nil))
}()
done := make(chan struct{})
bt := NewBadgerTest()
bt.Start()
<-done // wait
}
type BadgerTest struct {
db *badger.DB
}
func NewBadgerTest() *BadgerTest {
dir := "/raid0/badgertest"
opts := badger.DefaultOptions
opts.Dir = dir
opts.ValueDir = dir
opts.ValueLogLoadingMode = options.FileIO
opts.TableLoadingMode = options.FileIO
opts.NumCompactors = 20
opts.MaxTableSize = .25 * 1073741824 // .25GB
opts.NumMemtables = 2
opts.ValueLogFileSize = 1 * 1073741824 // 1GB
opts.SyncWrites = false
bytes, _ := json.Marshal(&opts)
logrus.Infof("BADGER OPTIONS=%s", string(bytes))
db, err := badger.Open(opts)
if err != nil {
panic(fmt.Sprintf("unable to open badger db; %s", err))
}
bt := &BadgerTest{
db: db,
}
go bt.filecounts(dir)
return bt
}
func (b *BadgerTest) Start() {
workers := 1
for i := 0; i < workers; i++ {
go b.write()
}
//go b.write()
go b.badgerGC()
go func() {
tick := time.NewTicker(1 * time.Minute)
for _ = range tick.C {
b.readKey()
}
}()
go func() {
tick := time.NewTicker(15 * time.Minute)
for _ = range tick.C {
stopWriting = true
return
}
}()
}
func (b *BadgerTest) Stop() {
b.db.Close()
logrus.Infof("%s shut down badger test", time.Now())
time.Sleep(1 * time.Second)
}
func (b *BadgerTest) filecounts(dir string) {
ticker := time.NewTicker(60 * time.Second)
for _ = range ticker.C {
logFiles := int64(0)
sstFiles := int64(0)
_ = filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
if filepath.Ext(path) == ".sst" {
sstFiles++
}
if filepath.Ext(path) == ".vlog" {
logFiles++
}
return nil
})
logrus.Infof("%s updated gauges vlog=%d sst=%d", time.Now(), logFiles, sstFiles)
}
}
func (b *BadgerTest) write() {
data := `{"randomstring":"6446D58D6DFAFD58586D3EA85A53F4A6B3CC057F933A22BB58E188A74AC8F663","refID":12495494,"testfield1234":"foo bar baz","date":"2018-01-01 12:00:00"}`
batchSize := 1000
rows := []writable{}
var cnt uint64
for {
if stopWriting {
logrus.Infof("%s stopping writing, let's see if compaction removes TTL'd data", time.Now())
return
}
cnt++
ts := time.Now().UnixNano()
buf := make([]byte, 16)
offset := 0
binary.BigEndian.PutUint64(buf[offset:], uint64(ts))
offset = offset + 8
binary.BigEndian.PutUint64(buf[offset:], cnt)
if !firstKeyStored {
firstKey = buf
firstKeyStored = true
logrus.Infof("%s firstkey stored %x", time.Now(), firstKey)
}
w := writable{key: buf, value: []byte(data)}
rows = append(rows, w)
if len(rows) > batchSize {
b.saveRows(rows)
rows = []writable{}
}
}
}
func (b *BadgerTest) saveRows(rows []writable) {
ttl := 20 * time.Minute
txn := b.db.NewTransaction(true)
for _, row := range rows {
if err := txn.SetWithTTL(row.key, row.value, ttl); err == badger.ErrTxnTooBig {
logrus.Infof("%s TX too big, committing...", time.Now())
_ = txn.Commit(nil)
txn = b.db.NewTransaction(true)
// retry the write in the fresh transaction and check the error this time
if err = txn.SetWithTTL(row.key, row.value, ttl); err != nil {
logrus.Errorf("unable to set key-val in fresh transaction; %s", err)
}
} else if err != nil {
logrus.Errorf("unable to set key-val in transaction; %s", err)
}
}
err := txn.Commit(nil)
if err != nil {
logrus.Errorf("unable to commit transaction; %s", err)
}
}
func (b *BadgerTest) readKey() {
// at some point our first key should be expired
err := b.db.View(func(txn *badger.Txn) error {
item, err := txn.Get([]byte(firstKey))
if err != nil {
return err
}
val, err := item.Value()
if err != nil {
return err
}
logrus.Infof("%s FIRSTKEY VALUE=%s", time.Now(), val)
return nil
})
if err != nil {
logrus.Errorf("%s FIRSTKEY unable to read first key from badger; %s", time.Now(), err)
}
}
func (b *BadgerTest) badgerGC() {
_, lastVlogSize := b.db.Size()
logrus.Infof("lastVLOGSize=%v", lastVlogSize)
ticker := time.NewTicker(2 * time.Minute)
const GB = int64(1 << 30)
for {
select {
case <-ticker.C:
_, currentVlogSize := b.db.Size()
//if currentVlogSize < lastVlogSize+GB {
// continue
//}
logrus.Infof("%s CLEANUP starting to purge keys size=%v", time.Now(), currentVlogSize)
err := b.db.PurgeOlderVersions()
if err != nil {
logrus.Errorf("%s badgerOps unable to purge older versions; %s", time.Now(), err)
}
logrus.Infof("%s CLEANUP PurgeOlderVersions completed", time.Now())
// If size increased by 3.5 GB, then we run this 3 times.
numTimes := (currentVlogSize - lastVlogSize) / GB
for i := 0; i < int(numTimes); i++ {
err := b.db.RunValueLogGC(0.5)
if err != nil {
logrus.Errorf("%s badgerOps unable to RunValueLogGC; %s", time.Now(), err)
}
logrus.Infof("%s CLEANUP RunValueLogGC completed iteration=%d", time.Now(), i)
}
_, lastVlogSize = b.db.Size()
}
}
}
Compaction won't happen if it isn't required. That means if you're not writing new things and causing the levels to overflow, they won't automatically compact away, and therefore the SST files won't get a chance to discard keys as they expire. However, as keys expire, value log GC will still churn away on them and cause value log files to be discarded or moved. Note that if you do close the DB, it will cause at least L0 to be fully compacted away.

P.S. For code samples, instead of putting them here, can you put them in a gist? Thanks!

Update: During my close-to-3-hour run with a 90s TTL, the SST files were reduced to a few KBs at L1 (only 10 keys, with many versions), with the value logs hovering close to 100GB. You can also see compaction skipping keys and versions via:
Yeah, I assumed that, so I'm reworking the test now to do continuous writes at a slower pace and see if it maintains a consistent memory/disk profile. I'll have some data shortly, thanks again.
- In response to #464.
- If a key version has a deletion marker or is expired, we can safely drop all the older versions of that key.
- If there's no overlap from lower levels, we can even drop the first such version.
- To avoid an edge-case bug, we need to ensure that all versions of the same key are contained in the same SSTable. This is reflected by not closing a table as long as more versions are found, and by picking key ranges that include all versions of the keys. Both measures ensure that all versions at the same level get compacted together.
- To avoid another edge-case bug, we need to ensure that we don't drop versions above the current `readTs`. Note that a managed DB would therefore not have any versions discarded; handling that is an open problem.

Badger Info:
- Print out the key ranges for each SSTable via `badger info`.
- Open an available port when running the `badger` tool, so one can view the `/debug/request` and `/debug/events` links to understand what's going on behind the scenes.
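A minimal toy sketch of the "all versions of a key stay in one table" rule described in the commit message above. This is not Badger's compaction code; the `kv` type and `targetSize` parameter are invented for the example, which only shows the table-closing rule itself.

```go
package main

import "fmt"

type kv struct {
	userKey string
	version int // higher = newer; a real entry would also carry value/meta
}

// buildTables packs sorted entries into tables of roughly targetSize entries,
// but never closes a table while the next entry is another version of the
// same user key, so every version of a key lands in exactly one table.
func buildTables(entries []kv, targetSize int) [][]kv {
	var tables [][]kv
	var cur []kv
	for i, e := range entries {
		cur = append(cur, e)
		last := i == len(entries)-1
		sameKeyNext := !last && entries[i+1].userKey == e.userKey
		if last || (len(cur) >= targetSize && !sameKeyNext) {
			tables = append(tables, cur)
			cur = nil
		}
	}
	return tables
}

func main() {
	// Sorted by key, newest version first, as in an LSM compaction stream.
	entries := []kv{{"a", 3}, {"a", 2}, {"a", 1}, {"b", 2}, {"b", 1}, {"c", 1}}
	for i, t := range buildTables(entries, 2) {
		fmt.Println("table", i, "->", t)
	}
	// All versions of "a" land in the same table, so dropping "a"'s tombstone
	// during compaction can't leave a stale version behind in a sibling table.
}
```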
Submitted as e597fb7. Based on my testing, this PR works. It would be good to hear from your end as well, considering you're running tests for a lot longer. BTW, I modified your code a bit to allow Ctrl+C to be captured so the DB closes gracefully, plus a couple of other changes. You can see my version here: https://gist.github.com/manishrjain/137b9849f2672a5fc3a106815d9d41d7
Were there any changes in that commit compared to the /removekeys branch I was testing against? I did a 12-hour soak test, and the good news is that memory stayed pretty constant (I slowed writes down to give compaction a chance to keep up), but load and disk usage climbed steadily over that time. I'll pull latest and do another long-running test with your gist above. Thanks again, Manish!

Edit: I'm going to change the test so that it writes at full speed for 15 minutes, then slows writes down, to observe space reclamation while writes are still coming in to kick off compactions.
I think you could try running value log GC more often -- most likely it's falling behind in reclaiming space as fast as writes are coming in.
Using the latest from master looks much better; you can clearly see where space was reclaimed. Test: write as fast as you can for 15 minutes with 15-minute TTL'd keys (200K+ per second), then slow down to 10-25K per second to give compaction time to catch up. It's on an uptrend right now, so I'm going to let it run a while longer to see if we reach a steady state or if it continues to fall behind. I'll then retest with a 30-second ticker for running value log GC.
Another way to figure out when to run value log GC is to do it every time the size of the value log increases by a gig or so (in addition to running it periodically). You can use this function to get the sizes: Line 1108 in e1ead00
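A minimal sketch of that suggestion, assuming the same v1.x API used in the test code above (`db.Size()` returning LSM and value-log sizes, and `RunValueLogGC`). The 30-second polling interval and 1GB threshold are arbitrary knobs for the example.

```go
package main

import (
	"time"

	"github.com/dgraph-io/badger"
)

// valueLogGCLoop polls db.Size() and runs value log GC whenever the value
// log has grown by roughly a gigabyte since the last pass, in addition to
// whatever periodic schedule the application already has.
func valueLogGCLoop(db *badger.DB) {
	const gb = int64(1 << 30)
	_, lastVlogSize := db.Size()
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()
	for range ticker.C {
		_, vlogSize := db.Size()
		if vlogSize-lastVlogSize < gb {
			continue
		}
		// Keep collecting until a pass reports nothing worth rewriting
		// (RunValueLogGC returns an error in that case).
		for db.RunValueLogGC(0.5) == nil {
		}
		_, lastVlogSize = db.Size()
	}
}

func main() {
	opts := badger.DefaultOptions
	opts.Dir = "/raid0/badgertest" // same directory as the test code above
	opts.ValueDir = opts.Dir
	db, err := badger.Open(opts)
	if err != nil {
		panic(err)
	}
	defer db.Close()

	go valueLogGCLoop(db)
	select {} // block; the write workload would run elsewhere
}
```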
I'm hoping this is a configuration-related issue, but I've played around with the settings and I keep getting the same behavior. Tested on an i3.4xlarge in AWS, with RAID 0 across the two SSD drives.
Expected behavior of the code below:
Actual behavior seen:
Please advise on what is wrong in the code below, thanks!
3 hours of runtime; you can see just linear growth:
https://imgur.com/a/2UUfIrG
UPDATE:
I've also tried with these settings; memory doesn't grow as fast, but it still climbs linearly until OOM, with the same behavior as above.