Skip to content

Commit

Permalink
opt: performance improvements for histograms
Browse files Browse the repository at this point in the history
This commit improves the performance of histograms in the optimizer
by allocating and copying the full histogram only when strictly
necessary. Additionally, it changes the code for filtering histograms
to use binary search, instead of a linear scan, to find the first bucket
that overlaps the constraint.

Release note: None
  • Loading branch information
rytaft committed Aug 1, 2019
1 parent 91f48bd commit 3243046
Show file tree
Hide file tree
Showing 8 changed files with 154 additions and 147 deletions.
8 changes: 5 additions & 3 deletions pkg/sql/opt/cat/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,15 +182,17 @@ type TableStatistic interface {
Histogram() []HistogramBucket
}

// HistogramBucket contains the data for a single histogram bucket.
// HistogramBucket contains the data for a single histogram bucket. Note
// that NumEq, NumRange, and DistinctRange are floats so the statisticsBuilder
// can apply filters to the histogram.
type HistogramBucket struct {
// NumEq is the estimated number of values equal to UpperBound.
NumEq uint64
NumEq float64

// NumRange is the estimated number of values between the upper bound of the
// previous bucket and UpperBound (both boundaries are exclusive).
// The first bucket should always have NumRange=0.
NumRange uint64
NumRange float64

// DistinctRange is the estimated number of distinct values between the upper
// bound of the previous bucket and UpperBound (both boundaries are
Expand Down
14 changes: 2 additions & 12 deletions pkg/sql/opt/memo/statistics_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -554,23 +554,13 @@ func (sb *statisticsBuilder) colStatScan(colSet opt.ColSet, scan *ScanExpr) *pro

inputColStat := sb.colStatTable(scan.Table, colSet)
colStat := sb.copyColStat(colSet, s, inputColStat)
if inputColStat.Histogram != nil {
colStat.Histogram = inputColStat.Histogram.Copy()
}
colStat.Histogram = inputColStat.Histogram

if s.Selectivity != 1 {
tableStats := sb.makeTableStatistics(scan.Table)
colStat.ApplySelectivity(s.Selectivity, tableStats.RowCount)
}

// Cap distinct and null counts at limit, if it exists.
if scan.HardLimit.IsSet() {
if limit := float64(scan.HardLimit.RowCount()); limit < s.RowCount {
colStat.DistinctCount = min(colStat.DistinctCount, limit)
colStat.NullCount = min(colStat.NullCount, limit)
}
}

if colSet.SubsetOf(relProps.NotNullCols) {
colStat.NullCount = 0
}
Expand Down Expand Up @@ -2215,7 +2205,7 @@ func (sb *statisticsBuilder) finalizeFromRowCount(
if colStat.Histogram != nil {
valuesCount := colStat.Histogram.ValuesCount()
if valuesCount > rowCount {
colStat.Histogram.ApplySelectivity(rowCount / valuesCount)
colStat.Histogram = colStat.Histogram.ApplySelectivity(rowCount / valuesCount)
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/sql/opt/memo/testdata/stats_quality/tpch
Original file line number Diff line number Diff line change
Expand Up @@ -2096,8 +2096,8 @@ sort
│ │ │ │ ├── save-table-name: q7_select_14
│ │ │ │ ├── columns: l_orderkey:8(int!null) l_suppkey:10(int!null) l_extendedprice:13(float!null) l_discount:14(float!null) l_shipdate:18(date!null)
│ │ │ │ ├── stats: [rows=1885529.35, distinct(8)=1180296.59, null(8)=0, distinct(10)=9920, null(10)=0, distinct(13)=845601.687, null(13)=0, distinct(14)=11, null(14)=0, distinct(18)=731, null(18)=0, distinct(13,14)=1885529.35, null(13,14)=0]
│ │ │ │ │ histogram(18)= 0 0 0.0092037 0.0016004 0.016006 0.0036015 0.019208 0.0024005 0.018007 0.0016004 0.018808 0.0020004 0.018007 0.0024005 0.019208 0.0024005 0.018808 0.0012003 0.019208 0.0012003 0.019208 0.0016004 0.016807 0.0036015 0.018407 0.0024005 0.018808 0.0020004 0.018007 0.0016004 0.019208 0.0032007 0.018007 0.0020004 0.018808 0.0024005 0.019208 0.0020004 0.017607 0.0020004 0.018407 0.0012003 0.018808 0.0024005 0.018808 0.0016004 0.018808 0.00080018 0.018808 0.00080018 0.018007 0.0016004 0.017207 0.0036015 0.018007 0.0032007 0.016006 0.0040016 0.018007 0.0020004 0.018808 0.00040009 0.018007 0.0024005 0.018808 0.0016004 0.017207 0.0024005 0.017607 0.0032007 0.017207 0.0020004 0.018007 0.0024005 0.018007 0.0020004 0.018407 0.0024005 0.018007 0.0036015 0.018808 0.0016004 0.018007 0.0024005 0.018808 0.00040009 0.016006 0.0036015 0.018808 0.00080018 0.018407 0.0012003 0.017607 0.0036015 0.018808 0.0024005 0.017607 0.0024005 0.018808 0.00080018 0.018007 0.0012003 0.018007 0.0032007 0.018808 0.00080018 0.017607 0.0016004 0.017607 0.0044017 0.017607 0.0020004 0.018407 0.0028006 0.017607 0.0016004 0.018808 0.0020004 0.018007 0.0024005 0.018407 0.00080018 0.017607 0.0024005 0.016807 0.0032007 0.0066936 0.0016734
│ │ │ │ │ <--- '1994-12-31' ----------- '1995-01-07' ---------- '1995-01-18' ---------- '1995-01-31' ---------- '1995-02-12' ---------- '1995-02-23' ---------- '1995-03-04' ---------- '1995-03-16' ---------- '1995-03-29' ---------- '1995-04-09' ---------- '1995-04-22' ---------- '1995-05-05' ---------- '1995-05-20' ---------- '1995-05-31' ---------- '1995-06-13' ---------- '1995-06-25' ---------- '1995-07-07' ---------- '1995-07-22' ---------- '1995-08-04' ---------- '1995-08-16' ---------- '1995-08-29' ---------- '1995-09-09' ---------- '1995-09-20' ---------- '1995-10-02' ---------- '1995-10-16' ---------- '1995-10-26' ---------- '1995-11-03' ---------- '1995-11-16' ---------- '1995-11-28' ---------- '1995-12-08' ---------- '1995-12-19' ---------- '1995-12-31' ---------- '1996-01-12' ---------- '1996-01-22' ---------- '1996-02-01' ---------- '1996-02-10' ---------- '1996-02-21' ---------- '1996-03-02' ---------- '1996-03-13' ---------- '1996-03-25' ---------- '1996-04-06' ---------- '1996-04-18' ---------- '1996-04-29' ---------- '1996-05-10' ---------- '1996-05-21' ---------- '1996-06-03' ---------- '1996-06-15' ---------- '1996-06-28' ---------- '1996-07-08' ---------- '1996-07-23' ---------- '1996-08-05' ---------- '1996-08-17' ---------- '1996-08-29' ---------- '1996-09-10' ---------- '1996-09-20' ---------- '1996-10-03' ---------- '1996-10-15' ---------- '1996-10-28' ---------- '1996-11-10' ---------- '1996-11-22' ---------- '1996-12-02' ---------- '1996-12-17' ---------- '1996-12-26' ----------- '1996-12-31'
│ │ │ │ │ histogram(18)= 0 0 13802 2400 24004 5401 28805 3600 27005 2400 28205 3000 27005 3600 28805 3600 28205 1800 28805 1800 28805 2400 25205 5401 27605 3600 28205 3000 27005 2400 28805 4800 27005 3000 28205 3600 28805 3000 26405 3000 27605 1800 28205 3600 28205 2400 28205 1200 28205 1200 27005 2400 25805 5401 27005 4800 24004 6001 27005 3000 28205 600 27005 3600 28205 2400 25805 3600 26405 4800 25805 3000 27005 3600 27005 3000 27605 3600 27005 5401 28205 2400 27005 3600 28205 600 24004 5401 28205 1200 27605 1800 26405 5401 28205 3600 26405 3600 28205 1200 27005 1800 27005 4800 28205 1200 26405 2400 26405 6601 26405 3000 27605 4200 26405 2400 28205 3000 27005 3600 27605 1200 26405 3600 25205 4800 10038 2509.5
│ │ │ │ │ <--- '1994-12-31' ------- '1995-01-07' ------- '1995-01-18' ------- '1995-01-31' ------- '1995-02-12' ------- '1995-02-23' ------- '1995-03-04' ------- '1995-03-16' ------- '1995-03-29' ------- '1995-04-09' ------- '1995-04-22' ------- '1995-05-05' ------- '1995-05-20' ------- '1995-05-31' ------- '1995-06-13' ------- '1995-06-25' ------- '1995-07-07' ------- '1995-07-22' ------- '1995-08-04' ------- '1995-08-16' ------- '1995-08-29' ------- '1995-09-09' ------- '1995-09-20' ------- '1995-10-02' ------- '1995-10-16' ------- '1995-10-26' ------- '1995-11-03' ------- '1995-11-16' ------- '1995-11-28' ------- '1995-12-08' ------- '1995-12-19' ------- '1995-12-31' ------- '1996-01-12' ------- '1996-01-22' ------- '1996-02-01' ------- '1996-02-10' ------- '1996-02-21' ------- '1996-03-02' ------- '1996-03-13' ------- '1996-03-25' ------- '1996-04-06' ------- '1996-04-18' ------- '1996-04-29' ------- '1996-05-10' ------- '1996-05-21' ------- '1996-06-03' ------- '1996-06-15' ------- '1996-06-28' ------- '1996-07-08' ------- '1996-07-23' ------- '1996-08-05' ------- '1996-08-17' ------- '1996-08-29' ------- '1996-09-10' ------- '1996-09-20' ------- '1996-10-03' ------- '1996-10-15' ------- '1996-10-28' ------- '1996-11-10' ------- '1996-11-22' ------- '1996-12-02' ------- '1996-12-17' ------- '1996-12-26' ------- '1996-12-31'
│ │ │ │ ├── scan lineitem
│ │ │ │ │ ├── save-table-name: q7_scan_15
│ │ │ │ │ ├── columns: l_orderkey:8(int!null) l_suppkey:10(int!null) l_extendedprice:13(float!null) l_discount:14(float!null) l_shipdate:18(date!null)
Expand Down
170 changes: 91 additions & 79 deletions pkg/sql/opt/props/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"fmt"
"io"
"math"
"sort"

"github.com/cockroachdb/cockroach/pkg/sql/opt"
"github.com/cockroachdb/cockroach/pkg/sql/opt/cat"
Expand All @@ -30,27 +31,7 @@ import (
type Histogram struct {
evalCtx *tree.EvalContext
col opt.ColumnID
buckets []HistogramBucket
}

// HistogramBucket contains the data for a single bucket in a Histogram. Note
// that NumEq, NumRange, and DistinctRange are floats so the statisticsBuilder
// can apply filters to the histogram.
type HistogramBucket struct {
// NumEq is the estimated number of values equal to UpperBound.
NumEq float64

// NumRange is the estimated number of values in this bucket not equal to
// UpperBound.
NumRange float64

// DistinctRange is the estimated number of distinct values in this bucket
// not equal to UpperBound.
DistinctRange float64

// UpperBound is the largest value in this bucket. The lower bound can be
// inferred based on the upper bound of the previous bucket in the histogram.
UpperBound tree.Datum
buckets []cat.HistogramBucket
}

func (h *Histogram) String() string {
Expand All @@ -67,21 +48,12 @@ func (h *Histogram) Init(
) {
h.evalCtx = evalCtx
h.col = col
if len(buckets) == 0 {
return
}
h.buckets = make([]HistogramBucket, len(buckets))
for i := range buckets {
h.buckets[i].NumEq = float64(buckets[i].NumEq)
h.buckets[i].NumRange = float64(buckets[i].NumRange)
h.buckets[i].DistinctRange = buckets[i].DistinctRange
h.buckets[i].UpperBound = buckets[i].UpperBound
}
h.buckets = buckets
}

// Copy returns a deep copy of the histogram.
func (h *Histogram) Copy() *Histogram {
buckets := make([]HistogramBucket, len(h.buckets))
buckets := make([]cat.HistogramBucket, len(h.buckets))
copy(buckets, h.buckets)
return &Histogram{
evalCtx: h.evalCtx,
Expand All @@ -97,7 +69,7 @@ func (h *Histogram) BucketCount() int {

// Bucket returns a pointer to the ith bucket in the histogram.
// i must be greater than or equal to 0 and less than BucketCount.
func (h *Histogram) Bucket(i int) *HistogramBucket {
func (h *Histogram) Bucket(i int) *cat.HistogramBucket {
return &h.buckets[i]
}

Expand Down Expand Up @@ -139,17 +111,17 @@ func (h *Histogram) maxDistinctValuesCount() float64 {
return 0
}

// The lower bound for the first bucket is the smallest possible value for
// the data type.
lowerBound, ok := h.buckets[0].UpperBound.Min(h.evalCtx)
if !ok {
lowerBound = h.buckets[0].UpperBound
// The first bucket always has a zero value for NumRange, so the lower bound
// of the histogram is the upper bound of the first bucket.
if h.Bucket(0).NumRange != 0 {
panic(errors.AssertionFailedf("the first bucket should have NumRange=0"))
}
lowerBound := h.Bucket(0).UpperBound

var count float64
for i := range h.buckets {
b := &h.buckets[i]
rng, ok := b.maxDistinctValuesInRange(lowerBound)
rng, ok := maxDistinctValuesInRange(lowerBound, b.UpperBound)

if ok && b.NumRange > rng {
count += rng
Expand All @@ -168,17 +140,17 @@ func (h *Histogram) maxDistinctValuesCount() float64 {
}

// maxDistinctValuesInRange returns the maximum number of distinct values in
// the range of the bucket (i.e., not including the upper bound). It returns
// ok=false when it is not possible to determine a finite value (which is the
// case for all types other than integers and dates).
func (b *HistogramBucket) maxDistinctValuesInRange(lowerBound tree.Datum) (_ float64, ok bool) {
// the range [lowerBound, upperBound). It returns ok=false when it is not
// possible to determine a finite value (which is the case for all types other
// than integers and dates).
func maxDistinctValuesInRange(lowerBound, upperBound tree.Datum) (_ float64, ok bool) {
switch lowerBound.ResolvedType().Family() {
case types.IntFamily:
return float64(*b.UpperBound.(*tree.DInt)) - float64(*lowerBound.(*tree.DInt)), true
return float64(*upperBound.(*tree.DInt)) - float64(*lowerBound.(*tree.DInt)), true

case types.DateFamily:
lower := lowerBound.(*tree.DDate)
upper := b.UpperBound.(*tree.DDate)
upper := upperBound.(*tree.DDate)
if lower.IsFinite() && upper.IsFinite() {
return float64(upper.PGEpochDays()) - float64(lower.PGEpochDays()), true
}
Expand Down Expand Up @@ -215,45 +187,66 @@ func (h *Histogram) Filter(c *constraint.Constraint) *Histogram {
panic(errors.AssertionFailedf("histogram filter with descending constraint not yet supported"))
}

bucketCount := h.BucketCount()
filtered := &Histogram{
evalCtx: h.evalCtx,
col: h.col,
buckets: make([]HistogramBucket, 0, len(h.buckets)),
buckets: make([]cat.HistogramBucket, 0, bucketCount),
}
if len(h.buckets) == 0 {
if bucketCount == 0 {
return filtered
}

// The lower bound for the first bucket is the smallest possible value for
// the data type.
// TODO(rytaft): Ensure that the first bucket has a zero value for NumRange,
// at least for types that don't have a Min.
lowerBound, ok := h.buckets[0].UpperBound.Min(h.evalCtx)
if !ok {
lowerBound = h.buckets[0].UpperBound
// The first bucket always has a zero value for NumRange, so the lower bound
// of the histogram is the upper bound of the first bucket.
if h.Bucket(0).NumRange != 0 {
panic(errors.AssertionFailedf("the first bucket should have NumRange=0"))
}

// Use variation on merge sort, because both sets of buckets and spans are
// ordered and non-overlapping.
// TODO(rytaft): use binary search to find the first bucket.
lowerBound := h.Bucket(0).UpperBound

bucIndex := 0
spanIndex := 0
keyCtx := constraint.KeyContext{EvalCtx: h.evalCtx}
keyCtx.Columns.InitSingle(opt.MakeOrderingColumn(h.col, false /* descending */))

for bucIndex < h.BucketCount() && spanIndex < c.Spans.Count() {
// Find the first span that may overlap with the histogram.
firstBucket := makeSpanFromBucket(h.Bucket(bucIndex), lowerBound)
spanCount := c.Spans.Count()
for spanIndex < spanCount {
span := c.Spans.Get(spanIndex)
if firstBucket.StartsAfter(&keyCtx, span) {
spanIndex++
continue
}
break
}
if spanIndex == spanCount {
return filtered
}

// Use binary search to find the first bucket that overlaps with the span.
span := c.Spans.Get(spanIndex)
bucIndex = sort.Search(bucketCount, func(i int) bool {
// The lower bound of the bucket doesn't matter here since we're just
// checking whether the span starts after the *upper bound* of the bucket.
bucket := makeSpanFromBucket(h.Bucket(i), lowerBound)
return !span.StartsAfter(&keyCtx, &bucket)
})
if bucIndex == bucketCount {
return filtered
}
if bucIndex > 0 {
prevUpperBound := h.Bucket(bucIndex - 1).UpperBound
filtered.addEmptyBucket(prevUpperBound)
lowerBound = h.getNextLowerBound(prevUpperBound)
}

// For the remaining buckets and spans, use a variation on merge sort.
for bucIndex < bucketCount && spanIndex < spanCount {
bucket := h.Bucket(bucIndex)
// Convert the bucket to a span in order to take advantage of the
// constraint library.
var left constraint.Span
left.Init(
constraint.MakeKey(lowerBound),
constraint.IncludeBoundary,
constraint.MakeKey(bucket.UpperBound),
constraint.IncludeBoundary,
)

left := makeSpanFromBucket(bucket, lowerBound)
right := c.Spans.Get(spanIndex)

if left.StartsAfter(&keyCtx, right) {
Expand All @@ -273,7 +266,7 @@ func (h *Histogram) Filter(c *constraint.Constraint) *Histogram {
if filteredSpan.Compare(&keyCtx, &left) != 0 {
// The bucket was cut off in the middle. Get the resulting filtered
// bucket.
filteredBucket = bucket.getFilteredBucket(&keyCtx, &filteredSpan, lowerBound)
filteredBucket = getFilteredBucket(bucket, &keyCtx, &filteredSpan, lowerBound)
if filteredSpan.CompareStarts(&keyCtx, &left) != 0 {
// We need to add an empty bucket before the new bucket.
emptyBucketUpperBound := filteredSpan.StartKey().Value(0)
Expand Down Expand Up @@ -311,10 +304,10 @@ func (h *Histogram) getNextLowerBound(currentUpperBound tree.Datum) tree.Datum {
}

func (h *Histogram) addEmptyBucket(upperBound tree.Datum) {
h.addBucket(&HistogramBucket{UpperBound: upperBound})
h.addBucket(&cat.HistogramBucket{UpperBound: upperBound})
}

func (h *Histogram) addBucket(bucket *HistogramBucket) {
func (h *Histogram) addBucket(bucket *cat.HistogramBucket) {
// Check whether we can combine this bucket with the previous bucket.
if len(h.buckets) != 0 {
lastBucket := &h.buckets[len(h.buckets)-1]
Expand All @@ -332,10 +325,11 @@ func (h *Histogram) addBucket(bucket *HistogramBucket) {
}

// ApplySelectivity reduces the size of each histogram bucket according to
// the given selectivity.
func (h *Histogram) ApplySelectivity(selectivity float64) {
for i := range h.buckets {
b := &h.buckets[i]
// the given selectivity, and returns a new histogram with the results.
func (h *Histogram) ApplySelectivity(selectivity float64) *Histogram {
res := h.Copy()
for i := range res.buckets {
b := &res.buckets[i]

// Save n and d for the distinct count formula below.
n := b.NumRange
Expand All @@ -356,6 +350,17 @@ func (h *Histogram) ApplySelectivity(selectivity float64) {
// when d << n.
b.DistinctRange = d - d*math.Pow(1-selectivity, n/d)
}
return res
}

// makeSpanFromBucket converts a histogram bucket into a constraint span so
// that the bucket can be compared against other spans using the constraint
// library. The resulting span starts at lowerBound and ends at b.UpperBound,
// and both boundaries are inclusive.
func makeSpanFromBucket(b *cat.HistogramBucket, lowerBound tree.Datum) (span constraint.Span) {
	startKey := constraint.MakeKey(lowerBound)
	endKey := constraint.MakeKey(b.UpperBound)
	span.Init(startKey, constraint.IncludeBoundary, endKey, constraint.IncludeBoundary)
	return span
}

// getFilteredBucket filters the histogram bucket according to the given span,
Expand Down Expand Up @@ -389,9 +394,12 @@ func (h *Histogram) ApplySelectivity(selectivity float64) {
// the size of NumRange if the bucket is cut off in the middle. In this case,
// we use the heuristic that NumRange is reduced by half.
//
func (b *HistogramBucket) getFilteredBucket(
keyCtx *constraint.KeyContext, filteredSpan *constraint.Span, bucketLowerBound tree.Datum,
) *HistogramBucket {
func getFilteredBucket(
b *cat.HistogramBucket,
keyCtx *constraint.KeyContext,
filteredSpan *constraint.Span,
bucketLowerBound tree.Datum,
) *cat.HistogramBucket {
spanLowerBound := filteredSpan.StartKey().Value(0)
spanUpperBound := filteredSpan.EndKey().Value(0)

Expand Down Expand Up @@ -506,7 +514,7 @@ func (b *HistogramBucket) getFilteredBucket(
distinctCountRange = b.DistinctRange * numRange / b.NumRange
}

return &HistogramBucket{
return &cat.HistogramBucket{
NumEq: numEq,
NumRange: numRange,
DistinctRange: distinctCountRange,
Expand Down Expand Up @@ -537,7 +545,7 @@ const (
boundaries
)

func (w *histogramWriter) init(buckets []HistogramBucket) {
func (w *histogramWriter) init(buckets []cat.HistogramBucket) {
w.cells = [][]string{
make([]string, len(buckets)*2),
make([]string, len(buckets)*2),
Expand All @@ -562,6 +570,10 @@ func (w *histogramWriter) init(buckets []HistogramBucket) {
}

func (w *histogramWriter) write(out io.Writer) {
if len(w.cells[counts]) == 0 {
return
}

// Print a space to match up with the "<" character below.
fmt.Fprint(out, " ")
for i := range w.cells[counts] {
Expand Down
Loading

0 comments on commit 3243046

Please sign in to comment.