From 4e56d97ae6dd27b7576cafcbcc4b77bde6e195ae Mon Sep 17 00:00:00 2001 From: Joshua Goldstein Date: Sat, 18 Feb 2023 11:55:12 -0600 Subject: [PATCH 1/6] feat(bench): add tool pick table benchmark (#1699) Add benchmark tool for picktable benchmarking. (cherry picked from commit ca80206d2c0c869560d5b9cfdcab0307c807a54c) --- badger/cmd/bench.go | 4 +- badger/cmd/pick_table_bench.go | 225 +++++++++++++++++++++++++++++++++ badger/cmd/read_bench.go | 5 +- badger/cmd/write_bench.go | 2 +- 4 files changed, 231 insertions(+), 5 deletions(-) create mode 100644 badger/cmd/pick_table_bench.go diff --git a/badger/cmd/bench.go b/badger/cmd/bench.go index 2ec2afe59..2cd4926df 100644 --- a/badger/cmd/bench.go +++ b/badger/cmd/bench.go @@ -23,8 +23,8 @@ import ( var benchCmd = &cobra.Command{ Use: "benchmark", Short: "Benchmark Badger database.", - Long: `This command will benchmark Badger for different usecases. Currently only read benchmark - is supported. Useful for testing and performance analysis.`, + Long: `This command will benchmark Badger for different usecases. + Useful for testing and performance analysis.`, } func init() { diff --git a/badger/cmd/pick_table_bench.go b/badger/cmd/pick_table_bench.go new file mode 100644 index 000000000..07a75eb25 --- /dev/null +++ b/badger/cmd/pick_table_bench.go @@ -0,0 +1,225 @@ +/* + * Copyright 2021 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package cmd + +import ( + "bytes" + "fmt" + "os" + "runtime/pprof" + "sort" + "testing" + + "github.com/dgraph-io/badger/v3" + "github.com/dgraph-io/badger/v3/options" + "github.com/dgraph-io/badger/v3/table" + "github.com/dgraph-io/badger/v3/y" + "github.com/spf13/cobra" +) + +var pickBenchCmd = &cobra.Command{ + Use: "picktable", + Short: "Benchmark pick tables.", + Long: `This command simulates pickTables used in iterators.`, + RunE: pickTableBench, +} + +var ( + pickOpts = struct { + readOnly bool + sampleSize int + cpuprofile string + }{} + keys [][]byte + handler levelHandler +) + +func init() { + benchCmd.AddCommand(pickBenchCmd) + pickBenchCmd.Flags().BoolVar( + &pickOpts.readOnly, "read-only", true, "If true, DB will be opened in read only mode.") + pickBenchCmd.Flags().IntVar( + &pickOpts.sampleSize, "sample-size", 1000000, "Sample size of keys to be used for lookup.") + pickBenchCmd.Flags().StringVar( + &pickOpts.cpuprofile, "cpuprofile", "", "Write CPU profile to file.") +} + +func pickTableBench(cmd *cobra.Command, args []string) error { + opt := badger.DefaultOptions(sstDir). + WithValueDir(vlogDir). + WithReadOnly(pickOpts.readOnly) + fmt.Printf("Opening badger with options = %+v\n", opt) + db, err := badger.OpenManaged(opt) + if err != nil { + return y.Wrapf(err, "unable to open DB") + } + defer db.Close() + + boundaries := getBoundaries(db) + tables := genTables(boundaries) + handler.init(tables) + keys, err = getSampleKeys(db, pickOpts.sampleSize) + y.Check(err) + fmt.Println("Running benchmark...") + fmt.Println("***** BenchmarkPickTables *****") + fmt.Println(testing.Benchmark(BenchmarkPickTables)) + fmt.Println("*******************************") + return nil +} + +func BenchmarkPickTables(b *testing.B) { + if len(pickOpts.cpuprofile) > 0 { + f, err := os.Create(pickOpts.cpuprofile) + y.Check(err) + pprof.StartCPUProfile(f) + defer pprof.StopCPUProfile() + } + b.ResetTimer() + iopts := iteratorOptions{prefixIsKey: true} + for i := 0; i < b.N; i++ { + for _, key := range keys { + iopts.Prefix = key + _ = handler.pickTables(iopts) + } + } +} + +// See badger.IteratorOptions (iterator.go) +type iteratorOptions struct { + prefixIsKey bool // If set, use the prefix for bloom filter lookup. + Prefix []byte // Only iterate over this given prefix. + SinceTs uint64 // Only read data that has version > SinceTs. +} + +// See compareToPrefix in iterator.go +func (opt *iteratorOptions) compareToPrefix(key []byte) int { + // We should compare key without timestamp. For example key - a[TS] might be > "aa" prefix. + key = y.ParseKey(key) + if len(key) > len(opt.Prefix) { + key = key[:len(opt.Prefix)] + } + return bytes.Compare(key, opt.Prefix) +} + +// See levelHandler in level_handler.go +type levelHandler struct { + tables []*table.Table +} + +func (s *levelHandler) init(tables []*table.Table) { + fmt.Println("Initializing level handler...") + s.tables = tables +} + +// This implementation is based on the implementation in master branch. +func (s *levelHandler) pickTables(opt iteratorOptions) []*table.Table { + filterTables := func(tables []*table.Table) []*table.Table { + if opt.SinceTs > 0 { + tmp := tables[:0] + for _, t := range tables { + if t.MaxVersion() < opt.SinceTs { + continue + } + tmp = append(tmp, t) + } + tables = tmp + } + return tables + } + + all := s.tables + if len(opt.Prefix) == 0 { + out := make([]*table.Table, len(all)) + copy(out, all) + return filterTables(out) + } + sIdx := sort.Search(len(all), func(i int) bool { + // table.Biggest >= opt.prefix + // if opt.Prefix < table.Biggest, then surely it is not in any of the preceding tables. + return opt.compareToPrefix(all[i].Biggest()) >= 0 + }) + if sIdx == len(all) { + // Not found. + return []*table.Table{} + } + + filtered := all[sIdx:] + if !opt.prefixIsKey { + eIdx := sort.Search(len(filtered), func(i int) bool { + return opt.compareToPrefix(filtered[i].Smallest()) > 0 + }) + out := make([]*table.Table, len(filtered[:eIdx])) + copy(out, filtered[:eIdx]) + return filterTables(out) + } + + // opt.prefixIsKey == true. This code is optimizing for opt.prefixIsKey part. + var out []*table.Table + // hash := y.Hash(opt.Prefix) + for _, t := range filtered { + // When we encounter the first table whose smallest key is higher than opt.Prefix, we can + // stop. This is an IMPORTANT optimization, just considering how often we call + // NewKeyIterator. + if opt.compareToPrefix(t.Smallest()) > 0 { + // if table.Smallest > opt.Prefix, then this and all tables after this can be ignored. + break + } + out = append(out, t) + } + return filterTables(out) +} + +// Sorts the boundaries and creates mock table out of them. +func genTables(boundaries [][]byte) []*table.Table { + buildTable := func(k1, k2 []byte) *table.Table { + opts := table.Options{ + ChkMode: options.NoVerification, + } + b := table.NewTableBuilder(opts) + defer b.Close() + // Add one key so that we can open this table. + b.Add(y.KeyWithTs(k1, 1), y.ValueStruct{}, 0) + b.Add(y.KeyWithTs(k2, 1), y.ValueStruct{}, 0) + tab, err := table.OpenInMemoryTable(b.Finish(), 0, &opts) + y.Check(err) + return tab + } + + sort.Slice(boundaries, func(i, j int) bool { + return bytes.Compare(boundaries[i], boundaries[j]) < 0 + }) + out := make([]*table.Table, 0, len(boundaries)) + for i := range boundaries { + var j int + if i != 0 { + j = i - 1 + } + out = append(out, buildTable(boundaries[i], boundaries[j])) + } + fmt.Printf("Created %d mock tables.\n", len(out)) + return out +} + +func getBoundaries(db *badger.DB) [][]byte { + fmt.Println("Getting the table boundaries...") + tables := db.Tables() + out := make([][]byte, 0, 2*len(tables)) + for _, t := range tables { + out = append(out, t.Left, t.Right) + } + return out +} diff --git a/badger/cmd/read_bench.go b/badger/cmd/read_bench.go index 0e8cb1fea..b3f1698f0 100644 --- a/badger/cmd/read_bench.go +++ b/badger/cmd/read_bench.go @@ -170,6 +170,7 @@ func lookupForKey(db *badger.DB, key []byte) (sz uint64) { err := db.View(func(txn *badger.Txn) error { iopt := badger.DefaultIteratorOptions iopt.AllVersions = true + iopt.PrefetchValues = false it := txn.NewKeyIterator(key, iopt) defer it.Close() @@ -189,7 +190,7 @@ func lookupForKey(db *badger.DB, key []byte) (sz uint64) { } // getSampleKeys uses stream framework internally, to get keys in random order. -func getSampleKeys(db *badger.DB) ([][]byte, error) { +func getSampleKeys(db *badger.DB, sampleSize int) ([][]byte, error) { var keys [][]byte count := 0 stream := db.NewStreamAt(math.MaxUint64) @@ -218,7 +219,7 @@ func getSampleKeys(db *badger.DB) ([][]byte, error) { } keys = append(keys, kv.Key) count++ - if count >= ro.sampleSize { + if count >= sampleSize { cancel() return errStop } diff --git a/badger/cmd/write_bench.go b/badger/cmd/write_bench.go index b1865e1b1..c842b915d 100644 --- a/badger/cmd/write_bench.go +++ b/badger/cmd/write_bench.go @@ -169,7 +169,7 @@ func writeRandom(db *badger.DB, num uint64) error { func readTest(db *badger.DB, dur time.Duration) { now := time.Now() - keys, err := getSampleKeys(db) + keys, err := getSampleKeys(db, ro.sampleSize) if err != nil { panic(err) } From 6b0349a3382aadda6bd8905a3dc86b2b3295b753 Mon Sep 17 00:00:00 2001 From: Joshua Goldstein Date: Sat, 18 Feb 2023 11:55:36 -0600 Subject: [PATCH 2/6] fix(benchmarks): use uint32 in filename generation (#1741) Note that the file ID can be 32 bit only, while the benchmarks were using 63-bit number for the same. (cherry picked from commit 292a4be727ec673ef14a1843e852f5187cf649f4) --- manifest_test.go | 2 +- table/table_test.go | 6 +++--- value_test.go | 5 +++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/manifest_test.go b/manifest_test.go index c48850603..4d8ecc5a3 100644 --- a/manifest_test.go +++ b/manifest_test.go @@ -139,7 +139,7 @@ func buildTable(t *testing.T, keyValues [][]string, bopts table.Options) *table. defer b.Close() // TODO: Add test for file garbage collection here. No files should be left after the tests here. - filename := fmt.Sprintf("%s%s%d.sst", os.TempDir(), string(os.PathSeparator), rand.Int63()) + filename := fmt.Sprintf("%s%s%d.sst", os.TempDir(), string(os.PathSeparator), rand.Uint32()) sort.Slice(keyValues, func(i, j int) bool { return keyValues[i][0] < keyValues[j][0] diff --git a/table/table_test.go b/table/table_test.go index 925f17eab..ab2cca589 100644 --- a/table/table_test.go +++ b/table/table_test.go @@ -652,7 +652,7 @@ func TestTableBigValues(t *testing.T) { builder.Add(key, vs, 0) } - filename := fmt.Sprintf("%s%s%d.sst", os.TempDir(), string(os.PathSeparator), rand.Int63()) + filename := fmt.Sprintf("%s%s%d.sst", os.TempDir(), string(os.PathSeparator), rand.Uint32()) tbl, err := CreateTable(filename, builder) require.NoError(t, err, "unable to open table") defer tbl.DecrRef() @@ -754,7 +754,7 @@ func BenchmarkReadMerged(b *testing.B) { require.NoError(b, err) for i := 0; i < m; i++ { - filename := fmt.Sprintf("%s%s%d.sst", os.TempDir(), string(os.PathSeparator), rand.Int63()) + filename := fmt.Sprintf("%s%s%d.sst", os.TempDir(), string(os.PathSeparator), rand.Uint32()) opts := Options{Compression: options.ZSTD, BlockSize: 4 * 1024, BloomFalsePositive: 0.01} opts.BlockCache = cache builder := NewTableBuilder(opts) @@ -848,7 +848,7 @@ func getTableForBenchmarks(b *testing.B, count int, cache *ristretto.Cache) *Tab opts.BlockCache = cache builder := NewTableBuilder(opts) defer builder.Close() - filename := fmt.Sprintf("%s%s%d.sst", os.TempDir(), string(os.PathSeparator), rand.Int63()) + filename := fmt.Sprintf("%s%s%d.sst", os.TempDir(), string(os.PathSeparator), rand.Uint32()) for i := 0; i < count; i++ { k := fmt.Sprintf("%016x", i) v := fmt.Sprintf("%d", i) diff --git a/value_test.go b/value_test.go index a2e95dd66..38f58f2ac 100644 --- a/value_test.go +++ b/value_test.go @@ -963,8 +963,9 @@ func BenchmarkReadWrite(b *testing.B) { dir, err := ioutil.TempDir("", "vlog-benchmark") y.Check(err) defer removeDir(dir) - - db, err := Open(getTestOptions(dir)) + opts := getTestOptions(dir) + opts.ValueThreshold = 0 + db, err := Open(opts) y.Check(err) vl := &db.vlog From d975055f72e660f57a1415ca50691b8179940ad9 Mon Sep 17 00:00:00 2001 From: Joshua Goldstein Date: Mon, 20 Feb 2023 18:41:12 -0600 Subject: [PATCH 3/6] check error --- badger/cmd/pick_table_bench.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/badger/cmd/pick_table_bench.go b/badger/cmd/pick_table_bench.go index 07a75eb25..63fe71fbe 100644 --- a/badger/cmd/pick_table_bench.go +++ b/badger/cmd/pick_table_bench.go @@ -85,7 +85,8 @@ func BenchmarkPickTables(b *testing.B) { if len(pickOpts.cpuprofile) > 0 { f, err := os.Create(pickOpts.cpuprofile) y.Check(err) - pprof.StartCPUProfile(f) + err = pprof.StartCPUProfile(f) + y.Check(err) defer pprof.StopCPUProfile() } b.ResetTimer() From 87d132d9f1763eb9a9f52e430648a5bddc1d0021 Mon Sep 17 00:00:00 2001 From: Joshua Goldstein Date: Mon, 20 Feb 2023 18:48:52 -0600 Subject: [PATCH 4/6] check error --- badger/cmd/pick_table_bench.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/badger/cmd/pick_table_bench.go b/badger/cmd/pick_table_bench.go index 63fe71fbe..fc8227469 100644 --- a/badger/cmd/pick_table_bench.go +++ b/badger/cmd/pick_table_bench.go @@ -67,7 +67,9 @@ func pickTableBench(cmd *cobra.Command, args []string) error { if err != nil { return y.Wrapf(err, "unable to open DB") } - defer db.Close() + defer func() { + y.Check(db.Close()) + }() boundaries := getBoundaries(db) tables := genTables(boundaries) From ac90d5b5516bc066202a17ed7db97bb0b2c8de06 Mon Sep 17 00:00:00 2001 From: Joshua Goldstein Date: Mon, 20 Feb 2023 21:20:25 -0600 Subject: [PATCH 5/6] import order --- badger/cmd/pick_table_bench.go | 1 + 1 file changed, 1 insertion(+) diff --git a/badger/cmd/pick_table_bench.go b/badger/cmd/pick_table_bench.go index fc8227469..a5be19ee2 100644 --- a/badger/cmd/pick_table_bench.go +++ b/badger/cmd/pick_table_bench.go @@ -28,6 +28,7 @@ import ( "github.com/dgraph-io/badger/v3/options" "github.com/dgraph-io/badger/v3/table" "github.com/dgraph-io/badger/v3/y" + "github.com/spf13/cobra" ) From 885b73dfa571415e3b1ae5d47c48ea2372bae88c Mon Sep 17 00:00:00 2001 From: Joshua Goldstein Date: Wed, 22 Feb 2023 20:46:15 -0600 Subject: [PATCH 6/6] reverse import orders --- badger/cmd/pick_table_bench.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/badger/cmd/pick_table_bench.go b/badger/cmd/pick_table_bench.go index a5be19ee2..af26bdbd2 100644 --- a/badger/cmd/pick_table_bench.go +++ b/badger/cmd/pick_table_bench.go @@ -24,12 +24,12 @@ import ( "sort" "testing" + "github.com/spf13/cobra" + "github.com/dgraph-io/badger/v3" "github.com/dgraph-io/badger/v3/options" "github.com/dgraph-io/badger/v3/table" "github.com/dgraph-io/badger/v3/y" - - "github.com/spf13/cobra" ) var pickBenchCmd = &cobra.Command{