Skip to content

Commit

Permalink
opt: performance improvements for histograms
Browse files Browse the repository at this point in the history
This commit improves the performance of histograms in the optimizer
by avoiding allocating and copying the full histogram unless strictly
necessary. Additionally, it changes the code for filtering histograms
to use binary search instead of performing a linear scan.

Release note: None
  • Loading branch information
rytaft committed Jul 30, 2019
1 parent 9e29ebe commit 80b804d
Show file tree
Hide file tree
Showing 6 changed files with 150 additions and 129 deletions.
8 changes: 5 additions & 3 deletions pkg/sql/opt/cat/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,15 +182,17 @@ type TableStatistic interface {
Histogram() []HistogramBucket
}

// HistogramBucket contains the data for a single histogram bucket.
// HistogramBucket contains the data for a single histogram bucket. Note
// that NumEq, NumRange, and DistinctRange are floats so the statisticsBuilder
// can apply filters to the histogram.
type HistogramBucket struct {
// NumEq is the estimated number of values equal to UpperBound.
NumEq uint64
NumEq float64

// NumRange is the estimated number of values between the upper bound of the
// previous bucket and UpperBound (both boundaries are exclusive).
// The first bucket should always have NumRange=0.
NumRange uint64
NumRange float64

// DistinctRange is the estimated number of distinct values between the upper
// bound of the previous bucket and UpperBound (both boundaries are
Expand Down
8 changes: 7 additions & 1 deletion pkg/sql/opt/memo/statistics_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -555,7 +555,13 @@ func (sb *statisticsBuilder) colStatScan(colSet opt.ColSet, scan *ScanExpr) *pro
inputColStat := sb.colStatTable(scan.Table, colSet)
colStat := sb.copyColStat(colSet, s, inputColStat)
if inputColStat.Histogram != nil {
colStat.Histogram = inputColStat.Histogram.Copy()
if s.Selectivity != 1 || scan.HardLimit.IsSet() {
colStat.Histogram = inputColStat.Histogram.Copy()
} else {
// We don't need to make a deep copy of the histogram because we won't be
// modifying it.
colStat.Histogram = inputColStat.Histogram
}
}

if s.Selectivity != 1 {
Expand Down
160 changes: 85 additions & 75 deletions pkg/sql/opt/props/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"fmt"
"io"
"math"
"sort"

"github.com/cockroachdb/cockroach/pkg/sql/opt"
"github.com/cockroachdb/cockroach/pkg/sql/opt/cat"
Expand All @@ -30,27 +31,7 @@ import (
type Histogram struct {
evalCtx *tree.EvalContext
col opt.ColumnID
buckets []HistogramBucket
}

// HistogramBucket contains the data for a single bucket in a Histogram. Note
// that NumEq, NumRange, and DistinctRange are floats so the statisticsBuilder
// can apply filters to the histogram.
type HistogramBucket struct {
// NumEq is the estimated number of values equal to UpperBound.
NumEq float64

// NumRange is the estimated number of values in this bucket not equal to
// UpperBound.
NumRange float64

// DistinctRange is the estimated number of distinct values in this bucket
// not equal to UpperBound.
DistinctRange float64

// UpperBound is the largest value in this bucket. The lower bound can be
// inferred based on the upper bound of the previous bucket in the histogram.
UpperBound tree.Datum
buckets []cat.HistogramBucket
}

func (h *Histogram) String() string {
Expand All @@ -67,21 +48,12 @@ func (h *Histogram) Init(
) {
h.evalCtx = evalCtx
h.col = col
if len(buckets) == 0 {
return
}
h.buckets = make([]HistogramBucket, len(buckets))
for i := range buckets {
h.buckets[i].NumEq = float64(buckets[i].NumEq)
h.buckets[i].NumRange = float64(buckets[i].NumRange)
h.buckets[i].DistinctRange = buckets[i].DistinctRange
h.buckets[i].UpperBound = buckets[i].UpperBound
}
h.buckets = buckets
}

// Copy returns a deep copy of the histogram.
func (h *Histogram) Copy() *Histogram {
buckets := make([]HistogramBucket, len(h.buckets))
buckets := make([]cat.HistogramBucket, len(h.buckets))
copy(buckets, h.buckets)
return &Histogram{
evalCtx: h.evalCtx,
Expand All @@ -97,7 +69,7 @@ func (h *Histogram) BucketCount() int {

// Bucket returns a pointer to the ith bucket in the histogram.
// i must be greater than or equal to 0 and less than BucketCount.
func (h *Histogram) Bucket(i int) *HistogramBucket {
func (h *Histogram) Bucket(i int) *cat.HistogramBucket {
return &h.buckets[i]
}

Expand Down Expand Up @@ -139,17 +111,17 @@ func (h *Histogram) maxDistinctValuesCount() float64 {
return 0
}

// The lower bound for the first bucket is the smallest possible value for
// the data type.
lowerBound, ok := h.buckets[0].UpperBound.Min(h.evalCtx)
if !ok {
lowerBound = h.buckets[0].UpperBound
// The first bucket always has a zero value for NumRange, so the lower bound
// of the histogram is the upper bound of the first bucket.
if h.Bucket(0).NumRange != 0 {
panic(errors.AssertionFailedf("the first bucket should have NumRange=0"))
}
lowerBound := h.Bucket(0).UpperBound

var count float64
for i := range h.buckets {
b := &h.buckets[i]
rng, ok := b.maxDistinctValuesInRange(lowerBound)
rng, ok := maxDistinctValuesInRange(lowerBound, b.UpperBound)

if ok && b.NumRange > rng {
count += rng
Expand All @@ -168,17 +140,17 @@ func (h *Histogram) maxDistinctValuesCount() float64 {
}

// maxDistinctValuesInRange returns the maximum number of distinct values in
// the range of the bucket (i.e., not including the upper bound). It returns
// ok=false when it is not possible to determine a finite value (which is the
// case for all types other than integers and dates).
func (b *HistogramBucket) maxDistinctValuesInRange(lowerBound tree.Datum) (_ float64, ok bool) {
// the range [lowerBound, upperBound). It returns ok=false when it is not
// possible to determine a finite value (which is the case for all types other
// than integers and dates).
func maxDistinctValuesInRange(lowerBound, upperBound tree.Datum) (_ float64, ok bool) {
switch lowerBound.ResolvedType().Family() {
case types.IntFamily:
return float64(*b.UpperBound.(*tree.DInt)) - float64(*lowerBound.(*tree.DInt)), true
return float64(*upperBound.(*tree.DInt)) - float64(*lowerBound.(*tree.DInt)), true

case types.DateFamily:
lower := lowerBound.(*tree.DDate)
upper := b.UpperBound.(*tree.DDate)
upper := upperBound.(*tree.DDate)
if lower.IsFinite() && upper.IsFinite() {
return float64(upper.PGEpochDays()) - float64(lower.PGEpochDays()), true
}
Expand Down Expand Up @@ -215,45 +187,66 @@ func (h *Histogram) Filter(c *constraint.Constraint) *Histogram {
panic(errors.AssertionFailedf("histogram filter with descending constraint not yet supported"))
}

bucketCount := h.BucketCount()
filtered := &Histogram{
evalCtx: h.evalCtx,
col: h.col,
buckets: make([]HistogramBucket, 0, len(h.buckets)),
buckets: make([]cat.HistogramBucket, 0, bucketCount),
}
if len(h.buckets) == 0 {
if bucketCount == 0 {
return filtered
}

// The lower bound for the first bucket is the smallest possible value for
// the data type.
// TODO(rytaft): Ensure that the first bucket has a zero value for NumRange,
// at least for types that don't have a Min.
lowerBound, ok := h.buckets[0].UpperBound.Min(h.evalCtx)
if !ok {
lowerBound = h.buckets[0].UpperBound
// The first bucket always has a zero value for NumRange, so the lower bound
// of the histogram is the upper bound of the first bucket.
if h.Bucket(0).NumRange != 0 {
panic(errors.AssertionFailedf("the first bucket should have NumRange=0"))
}

// Use variation on merge sort, because both sets of buckets and spans are
// ordered and non-overlapping.
// TODO(rytaft): use binary search to find the first bucket.
lowerBound := h.Bucket(0).UpperBound

bucIndex := 0
spanIndex := 0
keyCtx := constraint.KeyContext{EvalCtx: h.evalCtx}
keyCtx.Columns.InitSingle(opt.MakeOrderingColumn(h.col, false /* descending */))

for bucIndex < h.BucketCount() && spanIndex < c.Spans.Count() {
// Find the first span that may overlap with the histogram.
firstBucket := makeSpanFromBucket(h.Bucket(bucIndex), lowerBound)
spanCount := c.Spans.Count()
for spanIndex < spanCount {
span := c.Spans.Get(spanIndex)
if firstBucket.StartsAfter(&keyCtx, span) {
spanIndex++
continue
}
break
}
if spanIndex == spanCount {
return filtered
}

// Use binary search to find the first bucket that overlaps with the span.
span := c.Spans.Get(spanIndex)
bucIndex = sort.Search(bucketCount, func(i int) bool {
// The lower bound of the bucket doesn't matter here since we're just
// checking whether the span starts after the *upper bound* of the bucket.
bucket := makeSpanFromBucket(h.Bucket(i), lowerBound)
return !span.StartsAfter(&keyCtx, &bucket)
})
if bucIndex == bucketCount {
return filtered
}
if bucIndex > 0 {
prevUpperBound := h.Bucket(bucIndex - 1).UpperBound
filtered.addEmptyBucket(prevUpperBound)
lowerBound = h.getNextLowerBound(prevUpperBound)
}

// For the remaining buckets and spans, use a variation on merge sort.
for bucIndex < bucketCount && spanIndex < spanCount {
bucket := h.Bucket(bucIndex)
// Convert the bucket to a span in order to take advantage of the
// constraint library.
var left constraint.Span
left.Init(
constraint.MakeKey(lowerBound),
constraint.IncludeBoundary,
constraint.MakeKey(bucket.UpperBound),
constraint.IncludeBoundary,
)

left := makeSpanFromBucket(bucket, lowerBound)
right := c.Spans.Get(spanIndex)

if left.StartsAfter(&keyCtx, right) {
Expand All @@ -273,7 +266,7 @@ func (h *Histogram) Filter(c *constraint.Constraint) *Histogram {
if filteredSpan.Compare(&keyCtx, &left) != 0 {
// The bucket was cut off in the middle. Get the resulting filtered
// bucket.
filteredBucket = bucket.getFilteredBucket(&keyCtx, &filteredSpan, lowerBound)
filteredBucket = getFilteredBucket(bucket, &keyCtx, &filteredSpan, lowerBound)
if filteredSpan.CompareStarts(&keyCtx, &left) != 0 {
// We need to add an empty bucket before the new bucket.
emptyBucketUpperBound := filteredSpan.StartKey().Value(0)
Expand Down Expand Up @@ -311,10 +304,10 @@ func (h *Histogram) getNextLowerBound(currentUpperBound tree.Datum) tree.Datum {
}

func (h *Histogram) addEmptyBucket(upperBound tree.Datum) {
h.addBucket(&HistogramBucket{UpperBound: upperBound})
h.addBucket(&cat.HistogramBucket{UpperBound: upperBound})
}

func (h *Histogram) addBucket(bucket *HistogramBucket) {
func (h *Histogram) addBucket(bucket *cat.HistogramBucket) {
// Check whether we can combine this bucket with the previous bucket.
if len(h.buckets) != 0 {
lastBucket := &h.buckets[len(h.buckets)-1]
Expand Down Expand Up @@ -358,6 +351,16 @@ func (h *Histogram) ApplySelectivity(selectivity float64) {
}
}

func makeSpanFromBucket(b *cat.HistogramBucket, lowerBound tree.Datum) (span constraint.Span) {
span.Init(
constraint.MakeKey(lowerBound),
constraint.IncludeBoundary,
constraint.MakeKey(b.UpperBound),
constraint.IncludeBoundary,
)
return span
}

// getFilteredBucket filters the histogram bucket according to the given span,
// and returns a new bucket with the results. The span represents the maximum
// range of values that remain in the bucket after filtering. The span must
Expand Down Expand Up @@ -389,9 +392,12 @@ func (h *Histogram) ApplySelectivity(selectivity float64) {
// the size of NumRange if the bucket is cut off in the middle. In this case,
// we use the heuristic that NumRange is reduced by half.
//
func (b *HistogramBucket) getFilteredBucket(
keyCtx *constraint.KeyContext, filteredSpan *constraint.Span, bucketLowerBound tree.Datum,
) *HistogramBucket {
func getFilteredBucket(
b *cat.HistogramBucket,
keyCtx *constraint.KeyContext,
filteredSpan *constraint.Span,
bucketLowerBound tree.Datum,
) *cat.HistogramBucket {
spanLowerBound := filteredSpan.StartKey().Value(0)
spanUpperBound := filteredSpan.EndKey().Value(0)

Expand Down Expand Up @@ -506,7 +512,7 @@ func (b *HistogramBucket) getFilteredBucket(
distinctCountRange = b.DistinctRange * numRange / b.NumRange
}

return &HistogramBucket{
return &cat.HistogramBucket{
NumEq: numEq,
NumRange: numRange,
DistinctRange: distinctCountRange,
Expand Down Expand Up @@ -537,7 +543,7 @@ const (
boundaries
)

func (w *histogramWriter) init(buckets []HistogramBucket) {
func (w *histogramWriter) init(buckets []cat.HistogramBucket) {
w.cells = [][]string{
make([]string, len(buckets)*2),
make([]string, len(buckets)*2),
Expand All @@ -562,6 +568,10 @@ func (w *histogramWriter) init(buckets []HistogramBucket) {
}

func (w *histogramWriter) write(out io.Writer) {
if len(w.cells[counts]) == 0 {
return
}

// Print a space to match up with the "<" character below.
fmt.Fprint(out, " ")
for i := range w.cells[counts] {
Expand Down
Loading

0 comments on commit 80b804d

Please sign in to comment.