Skip to content

Commit

Permalink
fix(bench): bring in benchmark fixes from main (#1863)
Browse files Browse the repository at this point in the history
Cherry pick of #1699 and #1741.
  • Loading branch information
joshua-goldstein authored Feb 23, 2023
1 parent 1dce1d4 commit 4a3b224
Show file tree
Hide file tree
Showing 7 changed files with 242 additions and 11 deletions.
4 changes: 2 additions & 2 deletions badger/cmd/bench.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ import (
var benchCmd = &cobra.Command{
Use: "benchmark",
Short: "Benchmark Badger database.",
Long: `This command will benchmark Badger for different usecases. Currently only read benchmark
is supported. Useful for testing and performance analysis.`,
Long: `This command will benchmark Badger for different usecases.
Useful for testing and performance analysis.`,
}

func init() {
Expand Down
229 changes: 229 additions & 0 deletions badger/cmd/pick_table_bench.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
/*
* Copyright 2021 Dgraph Labs, Inc. and Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package cmd

import (
"bytes"
"fmt"
"os"
"runtime/pprof"
"sort"
"testing"

"github.com/spf13/cobra"

"github.com/dgraph-io/badger/v3"
"github.com/dgraph-io/badger/v3/options"
"github.com/dgraph-io/badger/v3/table"
"github.com/dgraph-io/badger/v3/y"
)

// pickBenchCmd is the "picktable" subcommand registered under the "benchmark"
// command. It measures levelHandler.pickTables, the table-selection step
// performed when iterators are created.
var pickBenchCmd = &cobra.Command{
	Use:   "picktable",
	Short: "Benchmark pick tables.",
	Long:  `This command simulates pickTables used in iterators.`,
	RunE:  pickTableBench,
}

var (
	// pickOpts holds the flag values for the picktable benchmark.
	pickOpts = struct {
		readOnly   bool   // open the DB in read-only mode
		sampleSize int    // number of keys sampled for lookups
		cpuprofile string // optional path to write a CPU profile to
	}{}
	// keys is the sampled key set the benchmark loops over; populated by
	// pickTableBench, read by BenchmarkPickTables.
	keys [][]byte
	// handler is the mock level handler whose pickTables is benchmarked.
	handler levelHandler
)

// init registers the picktable subcommand and its flags on the benchmark command.
func init() {
	benchCmd.AddCommand(pickBenchCmd)
	pickBenchCmd.Flags().BoolVar(
		&pickOpts.readOnly, "read-only", true, "If true, DB will be opened in read only mode.")
	pickBenchCmd.Flags().IntVar(
		&pickOpts.sampleSize, "sample-size", 1000000, "Sample size of keys to be used for lookup.")
	pickBenchCmd.Flags().StringVar(
		&pickOpts.cpuprofile, "cpuprofile", "", "Write CPU profile to file.")
}

// pickTableBench is the RunE handler for the picktable subcommand. It opens
// the DB at sstDir/vlogDir, builds mock tables from the real table
// boundaries, samples keys for lookups, and runs BenchmarkPickTables through
// the testing package, printing the result.
func pickTableBench(cmd *cobra.Command, args []string) error {
	opt := badger.DefaultOptions(sstDir).
		WithValueDir(vlogDir).
		WithReadOnly(pickOpts.readOnly)
	fmt.Printf("Opening badger with options = %+v\n", opt)
	db, err := badger.OpenManaged(opt)
	if err != nil {
		return y.Wrapf(err, "unable to open DB")
	}
	defer func() {
		y.Check(db.Close())
	}()

	boundaries := getBoundaries(db)
	tables := genTables(boundaries)
	handler.init(tables)
	// Return the error instead of panicking via y.Check, consistent with the
	// OpenManaged error handling above — this is a RunE handler, so callers
	// expect an error return rather than a panic.
	if keys, err = getSampleKeys(db, pickOpts.sampleSize); err != nil {
		return y.Wrapf(err, "unable to sample keys")
	}
	fmt.Println("Running benchmark...")
	fmt.Println("***** BenchmarkPickTables *****")
	fmt.Println(testing.Benchmark(BenchmarkPickTables))
	fmt.Println("*******************************")
	return nil
}

// BenchmarkPickTables measures levelHandler.pickTables across every sampled
// key, using prefix-is-key iterator options (the NewKeyIterator fast path).
// The package-level keys and handler must be initialized first (see
// pickTableBench). When --cpuprofile is set, a CPU profile covering the
// benchmark loop is written to that file.
func BenchmarkPickTables(b *testing.B) {
	if len(pickOpts.cpuprofile) > 0 {
		f, err := os.Create(pickOpts.cpuprofile)
		y.Check(err)
		// Close the profile file when done; the original code leaked this
		// descriptor for the life of the process. Defers run LIFO, so the
		// profile is stopped before the file is closed.
		defer func() { y.Check(f.Close()) }()
		err = pprof.StartCPUProfile(f)
		y.Check(err)
		defer pprof.StopCPUProfile()
	}
	b.ResetTimer()
	iopts := iteratorOptions{prefixIsKey: true}
	for i := 0; i < b.N; i++ {
		for _, key := range keys {
			iopts.Prefix = key
			_ = handler.pickTables(iopts)
		}
	}
}

// iteratorOptions is a trimmed-down copy of badger.IteratorOptions
// (see badger.IteratorOptions in iterator.go), carrying only the fields
// that pickTables consults.
type iteratorOptions struct {
	prefixIsKey bool   // If set, use the prefix for bloom filter lookup.
	Prefix      []byte // Only iterate over this given prefix.
	SinceTs     uint64 // Only read data that has version > SinceTs.
}

// compareToPrefix mirrors compareToPrefix in iterator.go. It compares the
// user portion of key (timestamp stripped) against opt.Prefix, first
// truncating the key to the prefix length so only the prefix-relevant bytes
// participate in the comparison.
func (opt *iteratorOptions) compareToPrefix(key []byte) int {
	// Strip the version suffix before comparing: a key like a[TS] could
	// otherwise compare greater than a longer prefix such as "aa".
	userKey := y.ParseKey(key)
	if limit := len(opt.Prefix); len(userKey) > limit {
		userKey = userKey[:limit]
	}
	return bytes.Compare(userKey, opt.Prefix)
}

// levelHandler is a minimal stand-in for levelHandler in level_handler.go:
// just the sorted list of tables that pickTables searches.
type levelHandler struct {
	tables []*table.Table
}

// init stores the mock tables this handler picks from. The slice is expected
// to already be ordered by key range (see genTables).
func (s *levelHandler) init(tables []*table.Table) {
	fmt.Println("Initializing level handler...")
	s.tables = tables
}

// This implementation is based on the implementation in master branch.
//
// pickTables returns the subset of s.tables whose key range may contain keys
// matching opt.Prefix, then drops candidates entirely older than opt.SinceTs.
// It is the function under measurement in BenchmarkPickTables, so the body
// intentionally mirrors the master-branch code path statement for statement;
// restructuring it would change what the benchmark measures.
func (s *levelHandler) pickTables(opt iteratorOptions) []*table.Table {
	// filterTables removes tables whose newest version is below opt.SinceTs.
	// It filters in place, reusing the candidate slice's backing array — the
	// callers below always hand it a copy, never s.tables itself.
	filterTables := func(tables []*table.Table) []*table.Table {
		if opt.SinceTs > 0 {
			tmp := tables[:0]
			for _, t := range tables {
				if t.MaxVersion() < opt.SinceTs {
					continue
				}
				tmp = append(tmp, t)
			}
			tables = tmp
		}
		return tables
	}

	all := s.tables
	// No prefix: every table is a candidate. Copy so filterTables cannot
	// mutate (and callers cannot alias) the handler's own slice.
	if len(opt.Prefix) == 0 {
		out := make([]*table.Table, len(all))
		copy(out, all)
		return filterTables(out)
	}
	// Binary search for the first table whose Biggest key reaches the prefix.
	sIdx := sort.Search(len(all), func(i int) bool {
		// table.Biggest >= opt.prefix
		// if opt.Prefix < table.Biggest, then surely it is not in any of the preceding tables.
		return opt.compareToPrefix(all[i].Biggest()) >= 0
	})
	if sIdx == len(all) {
		// Not found.
		return []*table.Table{}
	}

	filtered := all[sIdx:]
	if !opt.prefixIsKey {
		// General prefix scan: binary-search the upper bound too, then copy
		// the window out before filtering.
		eIdx := sort.Search(len(filtered), func(i int) bool {
			return opt.compareToPrefix(filtered[i].Smallest()) > 0
		})
		out := make([]*table.Table, len(filtered[:eIdx]))
		copy(out, filtered[:eIdx])
		return filterTables(out)
	}

	// opt.prefixIsKey == true. This code is optimizing for opt.prefixIsKey part.
	var out []*table.Table
	// hash := y.Hash(opt.Prefix)
	for _, t := range filtered {
		// When we encounter the first table whose smallest key is higher than opt.Prefix, we can
		// stop. This is an IMPORTANT optimization, just considering how often we call
		// NewKeyIterator.
		if opt.compareToPrefix(t.Smallest()) > 0 {
			// if table.Smallest > opt.Prefix, then this and all tables after this can be ignored.
			break
		}
		out = append(out, t)
	}
	return filterTables(out)
}

// Sorts the boundaries and creates mock table out of them.
// genTables builds one small in-memory table per boundary key, giving the
// handler a realistic, sorted table list to search.
func genTables(boundaries [][]byte) []*table.Table {
	// buildTable creates a tiny in-memory table holding just the two keys,
	// enough for Smallest()/Biggest() to be meaningful in pickTables.
	buildTable := func(k1, k2 []byte) *table.Table {
		opts := table.Options{
			// Mock tables need no checksum verification.
			ChkMode: options.NoVerification,
		}
		b := table.NewTableBuilder(opts)
		defer b.Close()
		// Add one key so that we can open this table.
		b.Add(y.KeyWithTs(k1, 1), y.ValueStruct{}, 0)
		b.Add(y.KeyWithTs(k2, 1), y.ValueStruct{}, 0)
		tab, err := table.OpenInMemoryTable(b.Finish(), 0, &opts)
		y.Check(err)
		return tab
	}

	sort.Slice(boundaries, func(i, j int) bool {
		return bytes.Compare(boundaries[i], boundaries[j]) < 0
	})
	out := make([]*table.Table, 0, len(boundaries))
	for i := range boundaries {
		// j is the previous boundary (0 for the first table), so each table
		// covers the span between two consecutive boundaries.
		var j int
		if i != 0 {
			j = i - 1
		}
		// NOTE(review): k1 is the larger boundary and k2 the smaller, so keys
		// reach the builder in descending order — presumably table.Builder.Add
		// tolerates this for mock tables; verify against the builder's
		// ordering requirements.
		out = append(out, buildTable(boundaries[i], boundaries[j]))
	}
	fmt.Printf("Created %d mock tables.\n", len(out))
	return out
}

// getBoundaries collects the Left and Right boundary keys of every table
// currently in the DB, yielding 2*len(tables) raw boundary keys.
func getBoundaries(db *badger.DB) [][]byte {
	fmt.Println("Getting the table boundaries...")
	infos := db.Tables()
	bounds := make([][]byte, 0, 2*len(infos))
	for i := range infos {
		bounds = append(bounds, infos[i].Left, infos[i].Right)
	}
	return bounds
}
5 changes: 3 additions & 2 deletions badger/cmd/read_bench.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ func lookupForKey(db *badger.DB, key []byte) (sz uint64) {
err := db.View(func(txn *badger.Txn) error {
iopt := badger.DefaultIteratorOptions
iopt.AllVersions = true
iopt.PrefetchValues = false
it := txn.NewKeyIterator(key, iopt)
defer it.Close()

Expand All @@ -189,7 +190,7 @@ func lookupForKey(db *badger.DB, key []byte) (sz uint64) {
}

// getSampleKeys uses stream framework internally, to get keys in random order.
func getSampleKeys(db *badger.DB) ([][]byte, error) {
func getSampleKeys(db *badger.DB, sampleSize int) ([][]byte, error) {
var keys [][]byte
count := 0
stream := db.NewStreamAt(math.MaxUint64)
Expand Down Expand Up @@ -218,7 +219,7 @@ func getSampleKeys(db *badger.DB) ([][]byte, error) {
}
keys = append(keys, kv.Key)
count++
if count >= ro.sampleSize {
if count >= sampleSize {
cancel()
return errStop
}
Expand Down
2 changes: 1 addition & 1 deletion badger/cmd/write_bench.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ func writeRandom(db *badger.DB, num uint64) error {

func readTest(db *badger.DB, dur time.Duration) {
now := time.Now()
keys, err := getSampleKeys(db)
keys, err := getSampleKeys(db, ro.sampleSize)
if err != nil {
panic(err)
}
Expand Down
2 changes: 1 addition & 1 deletion manifest_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ func buildTable(t *testing.T, keyValues [][]string, bopts table.Options) *table.
defer b.Close()
// TODO: Add test for file garbage collection here. No files should be left after the tests here.

filename := fmt.Sprintf("%s%s%d.sst", os.TempDir(), string(os.PathSeparator), rand.Int63())
filename := fmt.Sprintf("%s%s%d.sst", os.TempDir(), string(os.PathSeparator), rand.Uint32())

sort.Slice(keyValues, func(i, j int) bool {
return keyValues[i][0] < keyValues[j][0]
Expand Down
6 changes: 3 additions & 3 deletions table/table_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -653,7 +653,7 @@ func TestTableBigValues(t *testing.T) {
builder.Add(key, vs, 0)
}

filename := fmt.Sprintf("%s%s%d.sst", os.TempDir(), string(os.PathSeparator), rand.Int63())
filename := fmt.Sprintf("%s%s%d.sst", os.TempDir(), string(os.PathSeparator), rand.Uint32())
tbl, err := CreateTable(filename, builder)
require.NoError(t, err, "unable to open table")
defer func() { require.NoError(t, tbl.DecrRef()) }()
Expand Down Expand Up @@ -755,7 +755,7 @@ func BenchmarkReadMerged(b *testing.B) {
require.NoError(b, err)

for i := 0; i < m; i++ {
filename := fmt.Sprintf("%s%s%d.sst", os.TempDir(), string(os.PathSeparator), rand.Int63())
filename := fmt.Sprintf("%s%s%d.sst", os.TempDir(), string(os.PathSeparator), rand.Uint32())
opts := Options{Compression: options.ZSTD, BlockSize: 4 * 1024, BloomFalsePositive: 0.01}
opts.BlockCache = cache
builder := NewTableBuilder(opts)
Expand Down Expand Up @@ -849,7 +849,7 @@ func getTableForBenchmarks(b *testing.B, count int, cache *ristretto.Cache) *Tab
opts.BlockCache = cache
builder := NewTableBuilder(opts)
defer builder.Close()
filename := fmt.Sprintf("%s%s%d.sst", os.TempDir(), string(os.PathSeparator), rand.Int63())
filename := fmt.Sprintf("%s%s%d.sst", os.TempDir(), string(os.PathSeparator), rand.Uint32())
for i := 0; i < count; i++ {
k := fmt.Sprintf("%016x", i)
v := fmt.Sprintf("%d", i)
Expand Down
5 changes: 3 additions & 2 deletions value_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -969,8 +969,9 @@ func BenchmarkReadWrite(b *testing.B) {
dir, err := os.MkdirTemp("", "vlog-benchmark")
y.Check(err)
defer removeDir(dir)

db, err := Open(getTestOptions(dir))
opts := getTestOptions(dir)
opts.ValueThreshold = 0
db, err := Open(opts)
y.Check(err)

vl := &db.vlog
Expand Down

0 comments on commit 4a3b224

Please sign in to comment.