opt: performance improvements for histograms

This commit improves the performance of histograms in the optimizer by avoiding allocating and copying the full histogram unless strictly necessary. Additionally, it changes the code for filtering histograms to use binary search instead of performing a linear scan. Release note: None
cockroachdb · Jul 30, 2019 · 80b804d · 80b804d
1 parent 9e29ebe
commit 80b804d
Show file tree

Hide file tree

Showing 6 changed files with 150 additions and 129 deletions.
diff --git a/pkg/sql/opt/cat/table.go b/pkg/sql/opt/cat/table.go
@@ -182,15 +182,17 @@ type TableStatistic interface {
 	Histogram() []HistogramBucket
 }
 
-// HistogramBucket contains the data for a single histogram bucket.
+// HistogramBucket contains the data for a single histogram bucket. Note
+// that NumEq, NumRange, and DistinctRange are floats so the statisticsBuilder
+// can apply filters to the histogram.
 type HistogramBucket struct {
 	// NumEq is the estimated number of values equal to UpperBound.
-	NumEq uint64
+	NumEq float64
 
 	// NumRange is the estimated number of values between the upper bound of the
 	// previous bucket and UpperBound (both boundaries are exclusive).
 	// The first bucket should always have NumRange=0.
-	NumRange uint64
+	NumRange float64
 
 	// DistinctRange is the estimated number of distinct values between the upper
 	// bound of the previous bucket and UpperBound (both boundaries are

diff --git a/pkg/sql/opt/memo/statistics_builder.go b/pkg/sql/opt/memo/statistics_builder.go
@@ -555,7 +555,13 @@ func (sb *statisticsBuilder) colStatScan(colSet opt.ColSet, scan *ScanExpr) *pro
 	inputColStat := sb.colStatTable(scan.Table, colSet)
 	colStat := sb.copyColStat(colSet, s, inputColStat)
 	if inputColStat.Histogram != nil {
-		colStat.Histogram = inputColStat.Histogram.Copy()
+		if s.Selectivity != 1 || scan.HardLimit.IsSet() {
+			colStat.Histogram = inputColStat.Histogram.Copy()
+		} else {
+			// We don't need to make a deep copy of the histogram because we won't be
+			// modifying it.
+			colStat.Histogram = inputColStat.Histogram
+		}
 	}
 
 	if s.Selectivity != 1 {

diff --git a/pkg/sql/opt/props/histogram.go b/pkg/sql/opt/props/histogram.go
@@ -15,6 +15,7 @@ import (
 	"fmt"
 	"io"
 	"math"
+	"sort"
 
 	"github.com/cockroachdb/cockroach/pkg/sql/opt"
 	"github.com/cockroachdb/cockroach/pkg/sql/opt/cat"
@@ -30,27 +31,7 @@ import (
 type Histogram struct {
 	evalCtx *tree.EvalContext
 	col     opt.ColumnID
-	buckets []HistogramBucket
-}
-
-// HistogramBucket contains the data for a single bucket in a Histogram. Note
-// that NumEq, NumRange, and DistinctRange are floats so the statisticsBuilder
-// can apply filters to the histogram.
-type HistogramBucket struct {
-	// NumEq is the estimated number of values equal to UpperBound.
-	NumEq float64
-
-	// NumRange is the estimated number of values in this bucket not equal to
-	// UpperBound.
-	NumRange float64
-
-	// DistinctRange is the estimated number of distinct values in this bucket
-	// not equal to UpperBound.
-	DistinctRange float64
-
-	// UpperBound is the largest value in this bucket. The lower bound can be
-	// inferred based on the upper bound of the previous bucket in the histogram.
-	UpperBound tree.Datum
+	buckets []cat.HistogramBucket
 }
 
 func (h *Histogram) String() string {
@@ -67,21 +48,12 @@ func (h *Histogram) Init(
 ) {
 	h.evalCtx = evalCtx
 	h.col = col
-	if len(buckets) == 0 {
-		return
-	}
-	h.buckets = make([]HistogramBucket, len(buckets))
-	for i := range buckets {
-		h.buckets[i].NumEq = float64(buckets[i].NumEq)
-		h.buckets[i].NumRange = float64(buckets[i].NumRange)
-		h.buckets[i].DistinctRange = buckets[i].DistinctRange
-		h.buckets[i].UpperBound = buckets[i].UpperBound
-	}
+	h.buckets = buckets
 }
 
 // Copy returns a deep copy of the histogram.
 func (h *Histogram) Copy() *Histogram {
-	buckets := make([]HistogramBucket, len(h.buckets))
+	buckets := make([]cat.HistogramBucket, len(h.buckets))
 	copy(buckets, h.buckets)
 	return &Histogram{
 		evalCtx: h.evalCtx,
@@ -97,7 +69,7 @@ func (h *Histogram) BucketCount() int {
 
 // Bucket returns a pointer to the ith bucket in the histogram.
 // i must be greater than or equal to 0 and less than BucketCount.
-func (h *Histogram) Bucket(i int) *HistogramBucket {
+func (h *Histogram) Bucket(i int) *cat.HistogramBucket {
 	return &h.buckets[i]
 }
 
@@ -139,17 +111,17 @@ func (h *Histogram) maxDistinctValuesCount() float64 {
 		return 0
 	}
 
-	// The lower bound for the first bucket is the smallest possible value for
-	// the data type.
-	lowerBound, ok := h.buckets[0].UpperBound.Min(h.evalCtx)
-	if !ok {
-		lowerBound = h.buckets[0].UpperBound
+	// The first bucket always has a zero value for NumRange, so the lower bound
+	// of the histogram is the upper bound of the first bucket.
+	if h.Bucket(0).NumRange != 0 {
+		panic(errors.AssertionFailedf("the first bucket should have NumRange=0"))
 	}
+	lowerBound := h.Bucket(0).UpperBound
 
 	var count float64
 	for i := range h.buckets {
 		b := &h.buckets[i]
-		rng, ok := b.maxDistinctValuesInRange(lowerBound)
+		rng, ok := maxDistinctValuesInRange(lowerBound, b.UpperBound)
 
 		if ok && b.NumRange > rng {
 			count += rng
@@ -168,17 +140,17 @@ func (h *Histogram) maxDistinctValuesCount() float64 {
 }
 
 // maxDistinctValuesInRange returns the maximum number of distinct values in
-// the range of the bucket (i.e., not including the upper bound). It returns
-// ok=false when it is not possible to determine a finite value (which is the
-// case for all types other than integers and dates).
-func (b *HistogramBucket) maxDistinctValuesInRange(lowerBound tree.Datum) (_ float64, ok bool) {
+// the range [lowerBound, upperBound). It returns ok=false when it is not
+// possible to determine a finite value (which is the case for all types other
+// than integers and dates).
+func maxDistinctValuesInRange(lowerBound, upperBound tree.Datum) (_ float64, ok bool) {
 	switch lowerBound.ResolvedType().Family() {
 	case types.IntFamily:
-		return float64(*b.UpperBound.(*tree.DInt)) - float64(*lowerBound.(*tree.DInt)), true
+		return float64(*upperBound.(*tree.DInt)) - float64(*lowerBound.(*tree.DInt)), true
 
 	case types.DateFamily:
 		lower := lowerBound.(*tree.DDate)
-		upper := b.UpperBound.(*tree.DDate)
+		upper := upperBound.(*tree.DDate)
 		if lower.IsFinite() && upper.IsFinite() {
 			return float64(upper.PGEpochDays()) - float64(lower.PGEpochDays()), true
 		}
@@ -215,45 +187,66 @@ func (h *Histogram) Filter(c *constraint.Constraint) *Histogram {
 		panic(errors.AssertionFailedf("histogram filter with descending constraint not yet supported"))
 	}
 
+	bucketCount := h.BucketCount()
 	filtered := &Histogram{
 		evalCtx: h.evalCtx,
 		col:     h.col,
-		buckets: make([]HistogramBucket, 0, len(h.buckets)),
+		buckets: make([]cat.HistogramBucket, 0, bucketCount),
 	}
-	if len(h.buckets) == 0 {
+	if bucketCount == 0 {
 		return filtered
 	}
 
-	// The lower bound for the first bucket is the smallest possible value for
-	// the data type.
-	// TODO(rytaft): Ensure that the first bucket has a zero value for NumRange,
-	// at least for types that don't have a Min.
-	lowerBound, ok := h.buckets[0].UpperBound.Min(h.evalCtx)
-	if !ok {
-		lowerBound = h.buckets[0].UpperBound
+	// The first bucket always has a zero value for NumRange, so the lower bound
+	// of the histogram is the upper bound of the first bucket.
+	if h.Bucket(0).NumRange != 0 {
+		panic(errors.AssertionFailedf("the first bucket should have NumRange=0"))
 	}
-
-	// Use variation on merge sort, because both sets of buckets and spans are
-	// ordered and non-overlapping.
-	// TODO(rytaft): use binary search to find the first bucket.
+	lowerBound := h.Bucket(0).UpperBound
 
 	bucIndex := 0
 	spanIndex := 0
 	keyCtx := constraint.KeyContext{EvalCtx: h.evalCtx}
 	keyCtx.Columns.InitSingle(opt.MakeOrderingColumn(h.col, false /* descending */))
 
-	for bucIndex < h.BucketCount() && spanIndex < c.Spans.Count() {
+	// Find the first span that may overlap with the histogram.
+	firstBucket := makeSpanFromBucket(h.Bucket(bucIndex), lowerBound)
+	spanCount := c.Spans.Count()
+	for spanIndex < spanCount {
+		span := c.Spans.Get(spanIndex)
+		if firstBucket.StartsAfter(&keyCtx, span) {
+			spanIndex++
+			continue
+		}
+		break
+	}
+	if spanIndex == spanCount {
+		return filtered
+	}
+
+	// Use binary search to find the first bucket that overlaps with the span.
+	span := c.Spans.Get(spanIndex)
+	bucIndex = sort.Search(bucketCount, func(i int) bool {
+		// The lower bound of the bucket doesn't matter here since we're just
+		// checking whether the span starts after the *upper bound* of the bucket.
+		bucket := makeSpanFromBucket(h.Bucket(i), lowerBound)
+		return !span.StartsAfter(&keyCtx, &bucket)
+	})
+	if bucIndex == bucketCount {
+		return filtered
+	}
+	if bucIndex > 0 {
+		prevUpperBound := h.Bucket(bucIndex - 1).UpperBound
+		filtered.addEmptyBucket(prevUpperBound)
+		lowerBound = h.getNextLowerBound(prevUpperBound)
+	}
+
+	// For the remaining buckets and spans, use a variation on merge sort.
+	for bucIndex < bucketCount && spanIndex < spanCount {
 		bucket := h.Bucket(bucIndex)
 		// Convert the bucket to a span in order to take advantage of the
 		// constraint library.
-		var left constraint.Span
-		left.Init(
-			constraint.MakeKey(lowerBound),
-			constraint.IncludeBoundary,
-			constraint.MakeKey(bucket.UpperBound),
-			constraint.IncludeBoundary,
-		)
-
+		left := makeSpanFromBucket(bucket, lowerBound)
 		right := c.Spans.Get(spanIndex)
 
 		if left.StartsAfter(&keyCtx, right) {
@@ -273,7 +266,7 @@ func (h *Histogram) Filter(c *constraint.Constraint) *Histogram {
 		if filteredSpan.Compare(&keyCtx, &left) != 0 {
 			// The bucket was cut off in the middle. Get the resulting filtered
 			// bucket.
-			filteredBucket = bucket.getFilteredBucket(&keyCtx, &filteredSpan, lowerBound)
+			filteredBucket = getFilteredBucket(bucket, &keyCtx, &filteredSpan, lowerBound)
 			if filteredSpan.CompareStarts(&keyCtx, &left) != 0 {
 				// We need to add an empty bucket before the new bucket.
 				emptyBucketUpperBound := filteredSpan.StartKey().Value(0)
@@ -311,10 +304,10 @@ func (h *Histogram) getNextLowerBound(currentUpperBound tree.Datum) tree.Datum {
 }
 
 func (h *Histogram) addEmptyBucket(upperBound tree.Datum) {
-	h.addBucket(&HistogramBucket{UpperBound: upperBound})
+	h.addBucket(&cat.HistogramBucket{UpperBound: upperBound})
 }
 
-func (h *Histogram) addBucket(bucket *HistogramBucket) {
+func (h *Histogram) addBucket(bucket *cat.HistogramBucket) {
 	// Check whether we can combine this bucket with the previous bucket.
 	if len(h.buckets) != 0 {
 		lastBucket := &h.buckets[len(h.buckets)-1]
@@ -358,6 +351,16 @@ func (h *Histogram) ApplySelectivity(selectivity float64) {
 	}
 }
 
+func makeSpanFromBucket(b *cat.HistogramBucket, lowerBound tree.Datum) (span constraint.Span) {
+	span.Init(
+		constraint.MakeKey(lowerBound),
+		constraint.IncludeBoundary,
+		constraint.MakeKey(b.UpperBound),
+		constraint.IncludeBoundary,
+	)
+	return span
+}
+
 // getFilteredBucket filters the histogram bucket according to the given span,
 // and returns a new bucket with the results. The span represents the maximum
 // range of values that remain in the bucket after filtering. The span must
@@ -389,9 +392,12 @@ func (h *Histogram) ApplySelectivity(selectivity float64) {
 // the size of NumRange if the bucket is cut off in the middle. In this case,
 // we use the heuristic that NumRange is reduced by half.
 //
-func (b *HistogramBucket) getFilteredBucket(
-	keyCtx *constraint.KeyContext, filteredSpan *constraint.Span, bucketLowerBound tree.Datum,
-) *HistogramBucket {
+func getFilteredBucket(
+	b *cat.HistogramBucket,
+	keyCtx *constraint.KeyContext,
+	filteredSpan *constraint.Span,
+	bucketLowerBound tree.Datum,
+) *cat.HistogramBucket {
 	spanLowerBound := filteredSpan.StartKey().Value(0)
 	spanUpperBound := filteredSpan.EndKey().Value(0)
 
@@ -506,7 +512,7 @@ func (b *HistogramBucket) getFilteredBucket(
 		distinctCountRange = b.DistinctRange * numRange / b.NumRange
 	}
 
-	return &HistogramBucket{
+	return &cat.HistogramBucket{
 		NumEq:         numEq,
 		NumRange:      numRange,
 		DistinctRange: distinctCountRange,
@@ -537,7 +543,7 @@ const (
 	boundaries
 )
 
-func (w *histogramWriter) init(buckets []HistogramBucket) {
+func (w *histogramWriter) init(buckets []cat.HistogramBucket) {
 	w.cells = [][]string{
 		make([]string, len(buckets)*2),
 		make([]string, len(buckets)*2),
@@ -562,6 +568,10 @@ func (w *histogramWriter) init(buckets []HistogramBucket) {
 }
 
 func (w *histogramWriter) write(out io.Writer) {
+	if len(w.cells[counts]) == 0 {
+		return
+	}
+
 	// Print a space to match up with the "<" character below.
 	fmt.Fprint(out, " ")
 	for i := range w.cells[counts] {