opt: performance improvements for histograms

This commit improves the performance of histograms in the optimizer by
avoiding allocating and copying the full histogram unless strictly
necessary. Additionally, it changes the code for filtering histograms to
use binary search to find the first bucket that overlaps the constraint,
instead of performing a linear scan from the start of the histogram.

Release note: None
cockroachdb · Aug 1, 2019 · 3243046 · 3243046
1 parent 91f48bd
commit 3243046
Show file tree

Hide file tree

Showing 8 changed files with 154 additions and 147 deletions.
diff --git a/pkg/sql/opt/cat/table.go b/pkg/sql/opt/cat/table.go
@@ -182,15 +182,17 @@ type TableStatistic interface {
 	Histogram() []HistogramBucket
 }
 
-// HistogramBucket contains the data for a single histogram bucket.
+// HistogramBucket contains the data for a single histogram bucket. Note
+// that NumEq, NumRange, and DistinctRange are floats so the statisticsBuilder
+// can apply filters to the histogram.
 type HistogramBucket struct {
 	// NumEq is the estimated number of values equal to UpperBound.
-	NumEq uint64
+	NumEq float64
 
 	// NumRange is the estimated number of values between the upper bound of the
 	// previous bucket and UpperBound (both boundaries are exclusive).
 	// The first bucket should always have NumRange=0.
-	NumRange uint64
+	NumRange float64
 
 	// DistinctRange is the estimated number of distinct values between the upper
 	// bound of the previous bucket and UpperBound (both boundaries are

diff --git a/pkg/sql/opt/memo/statistics_builder.go b/pkg/sql/opt/memo/statistics_builder.go
@@ -554,23 +554,13 @@ func (sb *statisticsBuilder) colStatScan(colSet opt.ColSet, scan *ScanExpr) *pro
 
 	inputColStat := sb.colStatTable(scan.Table, colSet)
 	colStat := sb.copyColStat(colSet, s, inputColStat)
-	if inputColStat.Histogram != nil {
-		colStat.Histogram = inputColStat.Histogram.Copy()
-	}
+	colStat.Histogram = inputColStat.Histogram
 
 	if s.Selectivity != 1 {
 		tableStats := sb.makeTableStatistics(scan.Table)
 		colStat.ApplySelectivity(s.Selectivity, tableStats.RowCount)
 	}
 
-	// Cap distinct and null counts at limit, if it exists.
-	if scan.HardLimit.IsSet() {
-		if limit := float64(scan.HardLimit.RowCount()); limit < s.RowCount {
-			colStat.DistinctCount = min(colStat.DistinctCount, limit)
-			colStat.NullCount = min(colStat.NullCount, limit)
-		}
-	}
-
 	if colSet.SubsetOf(relProps.NotNullCols) {
 		colStat.NullCount = 0
 	}
@@ -2215,7 +2205,7 @@ func (sb *statisticsBuilder) finalizeFromRowCount(
 	if colStat.Histogram != nil {
 		valuesCount := colStat.Histogram.ValuesCount()
 		if valuesCount > rowCount {
-			colStat.Histogram.ApplySelectivity(rowCount / valuesCount)
+			colStat.Histogram = colStat.Histogram.ApplySelectivity(rowCount / valuesCount)
 		}
 	}
 }

diff --git a/pkg/sql/opt/memo/testdata/stats_quality/tpch b/pkg/sql/opt/memo/testdata/stats_quality/tpch
@@ -2096,8 +2096,8 @@ sort
       │    │    │    │    ├── save-table-name: q7_select_14
       │    │    │    │    ├── columns: l_orderkey:8(int!null) l_suppkey:10(int!null) l_extendedprice:13(float!null) l_discount:14(float!null) l_shipdate:18(date!null)
       │    │    │    │    ├── stats: [rows=1885529.35, distinct(8)=1180296.59, null(8)=0, distinct(10)=9920, null(10)=0, distinct(13)=845601.687, null(13)=0, distinct(14)=11, null(14)=0, distinct(18)=731, null(18)=0, distinct(13,14)=1885529.35, null(13,14)=0]
-      │    │    │    │    │   histogram(18)=  0       0        0.0092037   0.0016004    0.016006   0.0036015    0.019208   0.0024005    0.018007   0.0016004    0.018808   0.0020004    0.018007   0.0024005    0.019208   0.0024005    0.018808   0.0012003    0.019208   0.0012003    0.019208   0.0016004    0.016807   0.0036015    0.018407   0.0024005    0.018808   0.0020004    0.018007   0.0016004    0.019208   0.0032007    0.018007   0.0020004    0.018808   0.0024005    0.019208   0.0020004    0.017607   0.0020004    0.018407   0.0012003    0.018808   0.0024005    0.018808   0.0016004    0.018808   0.00080018   0.018808   0.00080018   0.018007   0.0016004    0.017207   0.0036015    0.018007   0.0032007    0.016006   0.0040016    0.018007   0.0020004    0.018808   0.00040009   0.018007   0.0024005    0.018808   0.0016004    0.017207   0.0024005    0.017607   0.0032007    0.017207   0.0020004    0.018007   0.0024005    0.018007   0.0020004    0.018407   0.0024005    0.018007   0.0036015    0.018808   0.0016004    0.018007   0.0024005    0.018808   0.00040009   0.016006   0.0036015    0.018808   0.00080018   0.018407   0.0012003    0.017607   0.0036015    0.018808   0.0024005    0.017607   0.0024005    0.018808   0.00080018   0.018007   0.0012003    0.018007   0.0032007    0.018808   0.00080018   0.017607   0.0016004    0.017607   0.0044017    0.017607   0.0020004    0.018407   0.0028006    0.017607   0.0016004    0.018808   0.0020004    0.018007   0.0024005    0.018407   0.00080018   0.017607   0.0024005    0.016807   0.0032007    0.0066936   0.0016734
-      │    │    │    │    │                 <--- '1994-12-31' ----------- '1995-01-07' ---------- '1995-01-18' ---------- '1995-01-31' ---------- '1995-02-12' ---------- '1995-02-23' ---------- '1995-03-04' ---------- '1995-03-16' ---------- '1995-03-29' ---------- '1995-04-09' ---------- '1995-04-22' ---------- '1995-05-05' ---------- '1995-05-20' ---------- '1995-05-31' ---------- '1995-06-13' ---------- '1995-06-25' ---------- '1995-07-07' ---------- '1995-07-22' ---------- '1995-08-04' ---------- '1995-08-16' ---------- '1995-08-29' ---------- '1995-09-09' ---------- '1995-09-20' ---------- '1995-10-02' ---------- '1995-10-16' ---------- '1995-10-26' ---------- '1995-11-03' ---------- '1995-11-16' ---------- '1995-11-28' ---------- '1995-12-08' ---------- '1995-12-19' ---------- '1995-12-31' ---------- '1996-01-12' ---------- '1996-01-22' ---------- '1996-02-01' ---------- '1996-02-10' ---------- '1996-02-21' ---------- '1996-03-02' ---------- '1996-03-13' ---------- '1996-03-25' ---------- '1996-04-06' ---------- '1996-04-18' ---------- '1996-04-29' ---------- '1996-05-10' ---------- '1996-05-21' ---------- '1996-06-03' ---------- '1996-06-15' ---------- '1996-06-28' ---------- '1996-07-08' ---------- '1996-07-23' ---------- '1996-08-05' ---------- '1996-08-17' ---------- '1996-08-29' ---------- '1996-09-10' ---------- '1996-09-20' ---------- '1996-10-03' ---------- '1996-10-15' ---------- '1996-10-28' ---------- '1996-11-10' ---------- '1996-11-22' ---------- '1996-12-02' ---------- '1996-12-17' ---------- '1996-12-26' ----------- '1996-12-31'
+      │    │    │    │    │   histogram(18)=  0       0        13802      2400      24004      5401      28805      3600      27005      2400      28205      3000      27005      3600      28805      3600      28205      1800      28805      1800      28805      2400      25205      5401      27605      3600      28205      3000      27005      2400      28805      4800      27005      3000      28205      3600      28805      3000      26405      3000      27605      1800      28205      3600      28205      2400      28205      1200      28205      1200      27005      2400      25805      5401      27005      4800      24004      6001      27005      3000      28205      600       27005      3600      28205      2400      25805      3600      26405      4800      25805      3000      27005      3600      27005      3000      27605      3600      27005      5401      28205      2400      27005      3600      28205      600       24004      5401      28205      1200      27605      1800      26405      5401      28205      3600      26405      3600      28205      1200      27005      1800      27005      4800      28205      1200      26405      2400      26405      6601      26405      3000      27605      4200      26405      2400      28205      3000      27005      3600      27605      1200      26405      3600      25205      4800      10038     2509.5
+      │    │    │    │    │                 <--- '1994-12-31' ------- '1995-01-07' ------- '1995-01-18' ------- '1995-01-31' ------- '1995-02-12' ------- '1995-02-23' ------- '1995-03-04' ------- '1995-03-16' ------- '1995-03-29' ------- '1995-04-09' ------- '1995-04-22' ------- '1995-05-05' ------- '1995-05-20' ------- '1995-05-31' ------- '1995-06-13' ------- '1995-06-25' ------- '1995-07-07' ------- '1995-07-22' ------- '1995-08-04' ------- '1995-08-16' ------- '1995-08-29' ------- '1995-09-09' ------- '1995-09-20' ------- '1995-10-02' ------- '1995-10-16' ------- '1995-10-26' ------- '1995-11-03' ------- '1995-11-16' ------- '1995-11-28' ------- '1995-12-08' ------- '1995-12-19' ------- '1995-12-31' ------- '1996-01-12' ------- '1996-01-22' ------- '1996-02-01' ------- '1996-02-10' ------- '1996-02-21' ------- '1996-03-02' ------- '1996-03-13' ------- '1996-03-25' ------- '1996-04-06' ------- '1996-04-18' ------- '1996-04-29' ------- '1996-05-10' ------- '1996-05-21' ------- '1996-06-03' ------- '1996-06-15' ------- '1996-06-28' ------- '1996-07-08' ------- '1996-07-23' ------- '1996-08-05' ------- '1996-08-17' ------- '1996-08-29' ------- '1996-09-10' ------- '1996-09-20' ------- '1996-10-03' ------- '1996-10-15' ------- '1996-10-28' ------- '1996-11-10' ------- '1996-11-22' ------- '1996-12-02' ------- '1996-12-17' ------- '1996-12-26' ------- '1996-12-31'
       │    │    │    │    ├── scan lineitem
       │    │    │    │    │    ├── save-table-name: q7_scan_15
       │    │    │    │    │    ├── columns: l_orderkey:8(int!null) l_suppkey:10(int!null) l_extendedprice:13(float!null) l_discount:14(float!null) l_shipdate:18(date!null)

diff --git a/pkg/sql/opt/props/histogram.go b/pkg/sql/opt/props/histogram.go
@@ -15,6 +15,7 @@ import (
 	"fmt"
 	"io"
 	"math"
+	"sort"
 
 	"github.com/cockroachdb/cockroach/pkg/sql/opt"
 	"github.com/cockroachdb/cockroach/pkg/sql/opt/cat"
@@ -30,27 +31,7 @@ import (
 type Histogram struct {
 	evalCtx *tree.EvalContext
 	col     opt.ColumnID
-	buckets []HistogramBucket
-}
-
-// HistogramBucket contains the data for a single bucket in a Histogram. Note
-// that NumEq, NumRange, and DistinctRange are floats so the statisticsBuilder
-// can apply filters to the histogram.
-type HistogramBucket struct {
-	// NumEq is the estimated number of values equal to UpperBound.
-	NumEq float64
-
-	// NumRange is the estimated number of values in this bucket not equal to
-	// UpperBound.
-	NumRange float64
-
-	// DistinctRange is the estimated number of distinct values in this bucket
-	// not equal to UpperBound.
-	DistinctRange float64
-
-	// UpperBound is the largest value in this bucket. The lower bound can be
-	// inferred based on the upper bound of the previous bucket in the histogram.
-	UpperBound tree.Datum
+	buckets []cat.HistogramBucket
 }
 
 func (h *Histogram) String() string {
@@ -67,21 +48,12 @@ func (h *Histogram) Init(
 ) {
 	h.evalCtx = evalCtx
 	h.col = col
-	if len(buckets) == 0 {
-		return
-	}
-	h.buckets = make([]HistogramBucket, len(buckets))
-	for i := range buckets {
-		h.buckets[i].NumEq = float64(buckets[i].NumEq)
-		h.buckets[i].NumRange = float64(buckets[i].NumRange)
-		h.buckets[i].DistinctRange = buckets[i].DistinctRange
-		h.buckets[i].UpperBound = buckets[i].UpperBound
-	}
+	h.buckets = buckets
 }
 
 // Copy returns a deep copy of the histogram.
 func (h *Histogram) Copy() *Histogram {
-	buckets := make([]HistogramBucket, len(h.buckets))
+	buckets := make([]cat.HistogramBucket, len(h.buckets))
 	copy(buckets, h.buckets)
 	return &Histogram{
 		evalCtx: h.evalCtx,
@@ -97,7 +69,7 @@ func (h *Histogram) BucketCount() int {
 
 // Bucket returns a pointer to the ith bucket in the histogram.
 // i must be greater than or equal to 0 and less than BucketCount.
-func (h *Histogram) Bucket(i int) *HistogramBucket {
+func (h *Histogram) Bucket(i int) *cat.HistogramBucket {
 	return &h.buckets[i]
 }
 
@@ -139,17 +111,17 @@ func (h *Histogram) maxDistinctValuesCount() float64 {
 		return 0
 	}
 
-	// The lower bound for the first bucket is the smallest possible value for
-	// the data type.
-	lowerBound, ok := h.buckets[0].UpperBound.Min(h.evalCtx)
-	if !ok {
-		lowerBound = h.buckets[0].UpperBound
+	// The first bucket always has a zero value for NumRange, so the lower bound
+	// of the histogram is the upper bound of the first bucket.
+	if h.Bucket(0).NumRange != 0 {
+		panic(errors.AssertionFailedf("the first bucket should have NumRange=0"))
 	}
+	lowerBound := h.Bucket(0).UpperBound
 
 	var count float64
 	for i := range h.buckets {
 		b := &h.buckets[i]
-		rng, ok := b.maxDistinctValuesInRange(lowerBound)
+		rng, ok := maxDistinctValuesInRange(lowerBound, b.UpperBound)
 
 		if ok && b.NumRange > rng {
 			count += rng
@@ -168,17 +140,17 @@ func (h *Histogram) maxDistinctValuesCount() float64 {
 }
 
 // maxDistinctValuesInRange returns the maximum number of distinct values in
-// the range of the bucket (i.e., not including the upper bound). It returns
-// ok=false when it is not possible to determine a finite value (which is the
-// case for all types other than integers and dates).
-func (b *HistogramBucket) maxDistinctValuesInRange(lowerBound tree.Datum) (_ float64, ok bool) {
+// the range [lowerBound, upperBound). It returns ok=false when it is not
+// possible to determine a finite value (which is the case for all types other
+// than integers and dates).
+func maxDistinctValuesInRange(lowerBound, upperBound tree.Datum) (_ float64, ok bool) {
 	switch lowerBound.ResolvedType().Family() {
 	case types.IntFamily:
-		return float64(*b.UpperBound.(*tree.DInt)) - float64(*lowerBound.(*tree.DInt)), true
+		return float64(*upperBound.(*tree.DInt)) - float64(*lowerBound.(*tree.DInt)), true
 
 	case types.DateFamily:
 		lower := lowerBound.(*tree.DDate)
-		upper := b.UpperBound.(*tree.DDate)
+		upper := upperBound.(*tree.DDate)
 		if lower.IsFinite() && upper.IsFinite() {
 			return float64(upper.PGEpochDays()) - float64(lower.PGEpochDays()), true
 		}
@@ -215,45 +187,66 @@ func (h *Histogram) Filter(c *constraint.Constraint) *Histogram {
 		panic(errors.AssertionFailedf("histogram filter with descending constraint not yet supported"))
 	}
 
+	bucketCount := h.BucketCount()
 	filtered := &Histogram{
 		evalCtx: h.evalCtx,
 		col:     h.col,
-		buckets: make([]HistogramBucket, 0, len(h.buckets)),
+		buckets: make([]cat.HistogramBucket, 0, bucketCount),
 	}
-	if len(h.buckets) == 0 {
+	if bucketCount == 0 {
 		return filtered
 	}
 
-	// The lower bound for the first bucket is the smallest possible value for
-	// the data type.
-	// TODO(rytaft): Ensure that the first bucket has a zero value for NumRange,
-	// at least for types that don't have a Min.
-	lowerBound, ok := h.buckets[0].UpperBound.Min(h.evalCtx)
-	if !ok {
-		lowerBound = h.buckets[0].UpperBound
+	// The first bucket always has a zero value for NumRange, so the lower bound
+	// of the histogram is the upper bound of the first bucket.
+	if h.Bucket(0).NumRange != 0 {
+		panic(errors.AssertionFailedf("the first bucket should have NumRange=0"))
 	}
-
-	// Use variation on merge sort, because both sets of buckets and spans are
-	// ordered and non-overlapping.
-	// TODO(rytaft): use binary search to find the first bucket.
+	lowerBound := h.Bucket(0).UpperBound
 
 	bucIndex := 0
 	spanIndex := 0
 	keyCtx := constraint.KeyContext{EvalCtx: h.evalCtx}
 	keyCtx.Columns.InitSingle(opt.MakeOrderingColumn(h.col, false /* descending */))
 
-	for bucIndex < h.BucketCount() && spanIndex < c.Spans.Count() {
+	// Find the first span that may overlap with the histogram.
+	firstBucket := makeSpanFromBucket(h.Bucket(bucIndex), lowerBound)
+	spanCount := c.Spans.Count()
+	for spanIndex < spanCount {
+		span := c.Spans.Get(spanIndex)
+		if firstBucket.StartsAfter(&keyCtx, span) {
+			spanIndex++
+			continue
+		}
+		break
+	}
+	if spanIndex == spanCount {
+		return filtered
+	}
+
+	// Use binary search to find the first bucket that overlaps with the span.
+	span := c.Spans.Get(spanIndex)
+	bucIndex = sort.Search(bucketCount, func(i int) bool {
+		// The lower bound of the bucket doesn't matter here since we're just
+		// checking whether the span starts after the *upper bound* of the bucket.
+		bucket := makeSpanFromBucket(h.Bucket(i), lowerBound)
+		return !span.StartsAfter(&keyCtx, &bucket)
+	})
+	if bucIndex == bucketCount {
+		return filtered
+	}
+	if bucIndex > 0 {
+		prevUpperBound := h.Bucket(bucIndex - 1).UpperBound
+		filtered.addEmptyBucket(prevUpperBound)
+		lowerBound = h.getNextLowerBound(prevUpperBound)
+	}
+
+	// For the remaining buckets and spans, use a variation on merge sort.
+	for bucIndex < bucketCount && spanIndex < spanCount {
 		bucket := h.Bucket(bucIndex)
 		// Convert the bucket to a span in order to take advantage of the
 		// constraint library.
-		var left constraint.Span
-		left.Init(
-			constraint.MakeKey(lowerBound),
-			constraint.IncludeBoundary,
-			constraint.MakeKey(bucket.UpperBound),
-			constraint.IncludeBoundary,
-		)
-
+		left := makeSpanFromBucket(bucket, lowerBound)
 		right := c.Spans.Get(spanIndex)
 
 		if left.StartsAfter(&keyCtx, right) {
@@ -273,7 +266,7 @@ func (h *Histogram) Filter(c *constraint.Constraint) *Histogram {
 		if filteredSpan.Compare(&keyCtx, &left) != 0 {
 			// The bucket was cut off in the middle. Get the resulting filtered
 			// bucket.
-			filteredBucket = bucket.getFilteredBucket(&keyCtx, &filteredSpan, lowerBound)
+			filteredBucket = getFilteredBucket(bucket, &keyCtx, &filteredSpan, lowerBound)
 			if filteredSpan.CompareStarts(&keyCtx, &left) != 0 {
 				// We need to add an empty bucket before the new bucket.
 				emptyBucketUpperBound := filteredSpan.StartKey().Value(0)
@@ -311,10 +304,10 @@ func (h *Histogram) getNextLowerBound(currentUpperBound tree.Datum) tree.Datum {
 }
 
 func (h *Histogram) addEmptyBucket(upperBound tree.Datum) {
-	h.addBucket(&HistogramBucket{UpperBound: upperBound})
+	h.addBucket(&cat.HistogramBucket{UpperBound: upperBound})
 }
 
-func (h *Histogram) addBucket(bucket *HistogramBucket) {
+func (h *Histogram) addBucket(bucket *cat.HistogramBucket) {
 	// Check whether we can combine this bucket with the previous bucket.
 	if len(h.buckets) != 0 {
 		lastBucket := &h.buckets[len(h.buckets)-1]
@@ -332,10 +325,11 @@ func (h *Histogram) addBucket(bucket *HistogramBucket) {
 }
 
 // ApplySelectivity reduces the size of each histogram bucket according to
-// the given selectivity.
-func (h *Histogram) ApplySelectivity(selectivity float64) {
-	for i := range h.buckets {
-		b := &h.buckets[i]
+// the given selectivity, and returns a new histogram with the results.
+func (h *Histogram) ApplySelectivity(selectivity float64) *Histogram {
+	res := h.Copy()
+	for i := range res.buckets {
+		b := &res.buckets[i]
 
 		// Save n and d for the distinct count formula below.
 		n := b.NumRange
@@ -356,6 +350,17 @@ func (h *Histogram) ApplySelectivity(selectivity float64) {
 		// when d << n.
 		b.DistinctRange = d - d*math.Pow(1-selectivity, n/d)
 	}
+	return res
+}
+
+func makeSpanFromBucket(b *cat.HistogramBucket, lowerBound tree.Datum) (span constraint.Span) {
+	span.Init(
+		constraint.MakeKey(lowerBound),
+		constraint.IncludeBoundary,
+		constraint.MakeKey(b.UpperBound),
+		constraint.IncludeBoundary,
+	)
+	return span
 }
 
 // getFilteredBucket filters the histogram bucket according to the given span,
@@ -389,9 +394,12 @@ func (h *Histogram) ApplySelectivity(selectivity float64) {
 // the size of NumRange if the bucket is cut off in the middle. In this case,
 // we use the heuristic that NumRange is reduced by half.
 //
-func (b *HistogramBucket) getFilteredBucket(
-	keyCtx *constraint.KeyContext, filteredSpan *constraint.Span, bucketLowerBound tree.Datum,
-) *HistogramBucket {
+func getFilteredBucket(
+	b *cat.HistogramBucket,
+	keyCtx *constraint.KeyContext,
+	filteredSpan *constraint.Span,
+	bucketLowerBound tree.Datum,
+) *cat.HistogramBucket {
 	spanLowerBound := filteredSpan.StartKey().Value(0)
 	spanUpperBound := filteredSpan.EndKey().Value(0)
 
@@ -506,7 +514,7 @@ func (b *HistogramBucket) getFilteredBucket(
 		distinctCountRange = b.DistinctRange * numRange / b.NumRange
 	}
 
-	return &HistogramBucket{
+	return &cat.HistogramBucket{
 		NumEq:         numEq,
 		NumRange:      numRange,
 		DistinctRange: distinctCountRange,
@@ -537,7 +545,7 @@ const (
 	boundaries
 )
 
-func (w *histogramWriter) init(buckets []HistogramBucket) {
+func (w *histogramWriter) init(buckets []cat.HistogramBucket) {
 	w.cells = [][]string{
 		make([]string, len(buckets)*2),
 		make([]string, len(buckets)*2),
@@ -562,6 +570,10 @@ func (w *histogramWriter) init(buckets []HistogramBucket) {
 }
 
 func (w *histogramWriter) write(out io.Writer) {
+	if len(w.cells[counts]) == 0 {
+		return
+	}
+
 	// Print a space to match up with the "<" character below.
 	fmt.Fprint(out, " ")
 	for i := range w.cells[counts] {