Skip to content

Commit

Permalink
Merge #39178
Browse files Browse the repository at this point in the history
39178: stats,opt: performance improvements for histograms r=rytaft a=rytaft

**stats,opt: move histogram bucket distinct count estimation code**
    
This commit moves the code to estimate the number of distinct values
per histogram bucket out of the optimizer and into the stats package.
The purpose of doing this is to remove the overhead of calculating the
number of distinct values per bucket from the critical path of query
planning, and instead perform the calculation offline when initially
generating the histogram.

**opt: performance improvements for histograms**
    
This commit improves the performance of histograms in the optimizer
by avoiding allocating and copying the full histogram unless strictly
necessary. Additionally, it changes the code for filtering histograms
to use binary search instead of performing a linear scan.

----

As a result of both of these commits, the overhead of histograms is reduced. Prior to these changes, enabling histograms caused 14% lower throughput and 29% higher latency when running kv with 95% reads. After these changes, histograms cause only 4.6% lower throughput and 11% higher latency. (Clearly there is still work to do, but this is some progress...)

Co-authored-by: Rebecca Taft <becca@cockroachlabs.com>
  • Loading branch information
craig[bot] and rytaft committed Aug 1, 2019
2 parents 9e67a9c + 3243046 commit 05b9ca0
Show file tree
Hide file tree
Showing 20 changed files with 23,274 additions and 16,007 deletions.
7 changes: 5 additions & 2 deletions pkg/sql/distsqlrun/sample_aggregator.go
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ func (s *sampleAggregator) writeResults(ctx context.Context) error {
// closure.
if err := s.flowCtx.ClientDB.Txn(ctx, func(ctx context.Context, txn *client.Txn) error {
for _, si := range s.sketches {
distinctCount := int64(si.sketch.Estimate())
var histogram *stats.HistogramData
if si.spec.GenerateHistogram && len(s.sr.Get()) != 0 {
colIdx := int(si.spec.Columns[0])
Expand All @@ -305,6 +306,7 @@ func (s *sampleAggregator) writeResults(ctx context.Context) error {
colIdx,
typ,
si.numRows,
distinctCount,
int(si.spec.HistogramMaxBuckets),
)
if err != nil {
Expand Down Expand Up @@ -338,7 +340,7 @@ func (s *sampleAggregator) writeResults(ctx context.Context) error {
si.spec.StatName,
columnIDs,
si.numRows,
int64(si.sketch.Estimate()),
distinctCount,
si.numNulls,
histogram,
); err != nil {
Expand All @@ -364,6 +366,7 @@ func generateHistogram(
colIdx int,
colType *types.T,
numRows int64,
distinctCount int64,
maxBuckets int,
) (stats.HistogramData, error) {
var da sqlbase.DatumAlloc
Expand All @@ -378,5 +381,5 @@ func generateHistogram(
values = append(values, ed.Datum)
}
}
return stats.EquiDepthHistogram(evalCtx, values, numRows, maxBuckets)
return stats.EquiDepthHistogram(evalCtx, values, numRows, distinctCount, maxBuckets)
}
24 changes: 12 additions & 12 deletions pkg/sql/logictest/testdata/logic_test/distsql_stats
Original file line number Diff line number Diff line change
Expand Up @@ -76,20 +76,20 @@ s1 {a} 10000 10 0 true
let $hist_id_1
SELECT histogram_id FROM [SHOW STATISTICS FOR TABLE data] WHERE statistics_name = 's1'

query TII colnames
query TIRI colnames
SHOW HISTOGRAM $hist_id_1
----
upper_bound range_rows equal_rows
1 0 1000
2 0 1000
3 0 1000
4 0 1000
5 0 1000
6 0 1000
7 0 1000
8 0 1000
9 0 1000
10 0 1000
upper_bound range_rows distinct_range_rows equal_rows
1 0 0 1000
2 0 0 1000
3 0 0 1000
4 0 0 1000
5 0 0 1000
6 0 0 1000
7 0 0 1000
8 0 0 1000
9 0 0 1000
10 0 0 1000

statement ok
CREATE STATISTICS "" ON b FROM data
Expand Down
23 changes: 13 additions & 10 deletions pkg/sql/opt/cat/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,19 +182,22 @@ type TableStatistic interface {
Histogram() []HistogramBucket
}

// HistogramBucket contains the data for a single histogram bucket.
// HistogramBucket contains the data for a single histogram bucket. Note
// that NumEq, NumRange, and DistinctRange are floats so the statisticsBuilder
// can apply filters to the histogram.
type HistogramBucket struct {
// NumEq is the estimated number of values equal to UpperBound.
NumEq uint64
NumEq float64

// NumRange is the estimated number of values between the lower bound of the
// bucket and UpperBound (both boundaries are exclusive).
//
// The lower bound is inferred based on the location of this bucket in a
// slice of buckets. If it is the first bucket, the lower bound is the minimum
// possible value for the given data type. Otherwise, the lower bound is equal
// to the upper bound of the previous bucket.
NumRange uint64
// NumRange is the estimated number of values between the upper bound of the
// previous bucket and UpperBound (both boundaries are exclusive).
// The first bucket should always have NumRange=0.
NumRange float64

// DistinctRange is the estimated number of distinct values between the upper
// bound of the previous bucket and UpperBound (both boundaries are
// exclusive).
DistinctRange float64

// UpperBound is the upper bound of the bucket.
UpperBound tree.Datum
Expand Down
16 changes: 3 additions & 13 deletions pkg/sql/opt/memo/statistics_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,7 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
if cols.Len() == 1 && stat.Histogram() != nil {
col, _ := cols.Next(0)
colStat.Histogram = &props.Histogram{}
colStat.Histogram.Init(sb.evalCtx, col, stat.Histogram(), colStat.DistinctCount)
colStat.Histogram.Init(sb.evalCtx, col, stat.Histogram())
}

// Make sure the distinct count is at least 1, for the same reason as
Expand Down Expand Up @@ -554,23 +554,13 @@ func (sb *statisticsBuilder) colStatScan(colSet opt.ColSet, scan *ScanExpr) *pro

inputColStat := sb.colStatTable(scan.Table, colSet)
colStat := sb.copyColStat(colSet, s, inputColStat)
if inputColStat.Histogram != nil {
colStat.Histogram = inputColStat.Histogram.Copy()
}
colStat.Histogram = inputColStat.Histogram

if s.Selectivity != 1 {
tableStats := sb.makeTableStatistics(scan.Table)
colStat.ApplySelectivity(s.Selectivity, tableStats.RowCount)
}

// Cap distinct and null counts at limit, if it exists.
if scan.HardLimit.IsSet() {
if limit := float64(scan.HardLimit.RowCount()); limit < s.RowCount {
colStat.DistinctCount = min(colStat.DistinctCount, limit)
colStat.NullCount = min(colStat.NullCount, limit)
}
}

if colSet.SubsetOf(relProps.NotNullCols) {
colStat.NullCount = 0
}
Expand Down Expand Up @@ -2215,7 +2205,7 @@ func (sb *statisticsBuilder) finalizeFromRowCount(
if colStat.Histogram != nil {
valuesCount := colStat.Histogram.ValuesCount()
if valuesCount > rowCount {
colStat.Histogram.ApplySelectivity(rowCount / valuesCount)
colStat.Histogram = colStat.Histogram.ApplySelectivity(rowCount / valuesCount)
}
}
}
Expand Down
96 changes: 47 additions & 49 deletions pkg/sql/opt/memo/testdata/stats/scan
Original file line number Diff line number Diff line change
Expand Up @@ -609,11 +609,11 @@ ALTER TABLE hist INJECT STATISTICS '[
"distinct_count": 40,
"histo_col_type": "int",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "upper_bound": "0"},
{"num_eq": 10, "num_range": 90, "upper_bound": "10"},
{"num_eq": 20, "num_range": 180, "upper_bound": "20"},
{"num_eq": 30, "num_range": 270, "upper_bound": "30"},
{"num_eq": 40, "num_range": 360, "upper_bound": "40"}
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "0"},
{"num_eq": 10, "num_range": 90, "distinct_range": 9, "upper_bound": "10"},
{"num_eq": 20, "num_range": 180, "distinct_range": 9, "upper_bound": "20"},
{"num_eq": 30, "num_range": 270, "distinct_range": 9, "upper_bound": "30"},
{"num_eq": 40, "num_range": 360, "distinct_range": 9, "upper_bound": "40"}
]
},
{
Expand All @@ -623,11 +623,11 @@ ALTER TABLE hist INJECT STATISTICS '[
"distinct_count": 120,
"histo_col_type": "date",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "upper_bound": "2018-06-30"},
{"num_eq": 10, "num_range": 90, "upper_bound": "2018-07-31"},
{"num_eq": 20, "num_range": 180, "upper_bound": "2018-08-31"},
{"num_eq": 30, "num_range": 270, "upper_bound": "2018-09-30"},
{"num_eq": 40, "num_range": 360, "upper_bound": "2018-10-31"}
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "2018-06-30"},
{"num_eq": 10, "num_range": 90, "distinct_range": 29, "upper_bound": "2018-07-31"},
{"num_eq": 20, "num_range": 180, "distinct_range": 29, "upper_bound": "2018-08-31"},
{"num_eq": 30, "num_range": 270, "distinct_range": 29, "upper_bound": "2018-09-30"},
{"num_eq": 40, "num_range": 360, "distinct_range": 29, "upper_bound": "2018-10-31"}
]
},
{
Expand All @@ -637,11 +637,11 @@ ALTER TABLE hist INJECT STATISTICS '[
"distinct_count": 45,
"histo_col_type": "decimal",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "upper_bound": "0"},
{"num_eq": 10, "num_range": 90, "upper_bound": "10"},
{"num_eq": 20, "num_range": 180, "upper_bound": "20"},
{"num_eq": 30, "num_range": 270, "upper_bound": "30"},
{"num_eq": 40, "num_range": 360, "upper_bound": "40"}
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "0"},
{"num_eq": 10, "num_range": 90, "distinct_range": 9, "upper_bound": "10"},
{"num_eq": 20, "num_range": 180, "distinct_range": 10, "upper_bound": "20"},
{"num_eq": 30, "num_range": 270, "distinct_range": 11, "upper_bound": "30"},
{"num_eq": 40, "num_range": 360, "distinct_range": 11, "upper_bound": "40"}
]
},
{
Expand All @@ -651,11 +651,11 @@ ALTER TABLE hist INJECT STATISTICS '[
"distinct_count": 45,
"histo_col_type": "float",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "upper_bound": "0"},
{"num_eq": 10, "num_range": 90, "upper_bound": "10"},
{"num_eq": 20, "num_range": 180, "upper_bound": "20"},
{"num_eq": 30, "num_range": 270, "upper_bound": "30"},
{"num_eq": 40, "num_range": 360, "upper_bound": "40"}
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "0"},
{"num_eq": 10, "num_range": 90, "distinct_range": 9, "upper_bound": "10"},
{"num_eq": 20, "num_range": 180, "distinct_range": 10, "upper_bound": "20"},
{"num_eq": 30, "num_range": 270, "distinct_range": 11, "upper_bound": "30"},
{"num_eq": 40, "num_range": 360, "distinct_range": 11, "upper_bound": "40"}
]
},
{
Expand All @@ -665,11 +665,11 @@ ALTER TABLE hist INJECT STATISTICS '[
"distinct_count": 200,
"histo_col_type": "timestamp",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "upper_bound": "2018-06-30"},
{"num_eq": 10, "num_range": 90, "upper_bound": "2018-07-31"},
{"num_eq": 20, "num_range": 180, "upper_bound": "2018-08-31"},
{"num_eq": 30, "num_range": 270, "upper_bound": "2018-09-30"},
{"num_eq": 40, "num_range": 360, "upper_bound": "2018-10-31"}
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "2018-06-30"},
{"num_eq": 10, "num_range": 90, "distinct_range": 49, "upper_bound": "2018-07-31"},
{"num_eq": 20, "num_range": 180, "distinct_range": 49, "upper_bound": "2018-08-31"},
{"num_eq": 30, "num_range": 270, "distinct_range": 49, "upper_bound": "2018-09-30"},
{"num_eq": 40, "num_range": 360, "distinct_range": 49, "upper_bound": "2018-10-31"}
]
},
{
Expand All @@ -679,11 +679,11 @@ ALTER TABLE hist INJECT STATISTICS '[
"distinct_count": 200,
"histo_col_type": "timestamptz",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "upper_bound": "2018-06-30"},
{"num_eq": 10, "num_range": 90, "upper_bound": "2018-07-31"},
{"num_eq": 20, "num_range": 180, "upper_bound": "2018-08-31"},
{"num_eq": 30, "num_range": 270, "upper_bound": "2018-09-30"},
{"num_eq": 40, "num_range": 360, "upper_bound": "2018-10-31"}
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "2018-06-30"},
{"num_eq": 10, "num_range": 90, "distinct_range": 49, "upper_bound": "2018-07-31"},
{"num_eq": 20, "num_range": 180, "distinct_range": 49, "upper_bound": "2018-08-31"},
{"num_eq": 30, "num_range": 270, "distinct_range": 49, "upper_bound": "2018-09-30"},
{"num_eq": 40, "num_range": 360, "distinct_range": 49, "upper_bound": "2018-10-31"}
]
},
{
Expand All @@ -693,11 +693,11 @@ ALTER TABLE hist INJECT STATISTICS '[
"distinct_count": 40,
"histo_col_type": "string",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "upper_bound": "apple"},
{"num_eq": 10, "num_range": 90, "upper_bound": "banana"},
{"num_eq": 20, "num_range": 180, "upper_bound": "cherry"},
{"num_eq": 30, "num_range": 270, "upper_bound": "mango"},
{"num_eq": 40, "num_range": 360, "upper_bound": "pineapple"}
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "apple"},
{"num_eq": 10, "num_range": 90, "distinct_range": 9, "upper_bound": "banana"},
{"num_eq": 20, "num_range": 180, "distinct_range": 9, "upper_bound": "cherry"},
{"num_eq": 30, "num_range": 270, "distinct_range": 9, "upper_bound": "mango"},
{"num_eq": 40, "num_range": 360, "distinct_range": 9, "upper_bound": "pineapple"}
]
}
]'
Expand All @@ -709,13 +709,13 @@ SELECT * FROM hist WHERE a < 10
----
index-join hist
├── columns: a:1(int!null) b:2(date) c:3(decimal) d:4(float) e:5(timestamp) f:6(timestamptz) g:7(string)
├── stats: [rows=90, distinct(1)=8.99985057, null(1)=0]
├── stats: [rows=90, distinct(1)=9, null(1)=0]
│ histogram(1)= 0 0 80 10
│ <--- 0 ---- 9
└── scan hist@idx_a
├── columns: a:1(int!null) rowid:8(int!null)
├── constraint: /1/8: (/NULL - /9]
├── stats: [rows=90, distinct(1)=8.99985057, null(1)=0, distinct(8)=90, null(8)=0]
├── stats: [rows=90, distinct(1)=9, null(1)=0, distinct(8)=90, null(8)=0]
│ histogram(1)= 0 0 80 10
│ <--- 0 ---- 9
├── key: (8)
Expand Down Expand Up @@ -743,13 +743,13 @@ SELECT * FROM hist WHERE b > '2018-07-31'::DATE AND b < '2018-08-05'::DATE
----
index-join hist
├── columns: a:1(int) b:2(date!null) c:3(decimal) d:4(float) e:5(timestamp) f:6(timestamptz) g:7(string)
├── stats: [rows=24, distinct(2)=3.95478935, null(2)=0]
├── stats: [rows=24, distinct(2)=3.9, null(2)=0]
│ histogram(2)= 0 0 18 6
│ <--- '2018-07-31' ---- '2018-08-04'
└── scan hist@idx_b
├── columns: b:2(date!null) rowid:8(int!null)
├── constraint: /2/8: [/'2018-08-01' - /'2018-08-04']
├── stats: [rows=24, distinct(2)=3.95478935, null(2)=0, distinct(8)=24, null(8)=0]
├── stats: [rows=24, distinct(2)=3.9, null(2)=0, distinct(8)=24, null(8)=0]
│ histogram(2)= 0 0 18 6
│ <--- '2018-07-31' ---- '2018-08-04'
├── key: (8)
Expand All @@ -764,7 +764,7 @@ index-join hist
└── scan hist@idx_c
├── columns: c:3(decimal!null) rowid:8(int!null)
├── constraint: /3/8: (/NULL - /10) [/20 - /20]
├── stats: [rows=110, distinct(3)=5.1, null(3)=0, distinct(8)=110, null(8)=0]
├── stats: [rows=110, distinct(3)=10, null(3)=0, distinct(8)=110, null(8)=0]
│ histogram(3)= 0 0 90 0 0 20
│ <--- 0 ---- 10 --- 20
├── key: (8)
Expand All @@ -779,7 +779,7 @@ index-join hist
└── scan hist@idx_c
├── columns: c:3(decimal!null) rowid:8(int!null)
├── constraint: /3/8: (/NULL - /10] [/20 - /20]
├── stats: [rows=120, distinct(3)=6.1, null(3)=0, distinct(8)=120, null(8)=0]
├── stats: [rows=120, distinct(3)=11, null(3)=0, distinct(8)=120, null(8)=0]
│ histogram(3)= 0 0 90 10 0 20
│ <--- 0 ---- 10 --- 20
├── key: (8)
Expand All @@ -793,13 +793,13 @@ index-join hist
├── stats: [rows=333.333333]
└── select
├── columns: d:4(float!null) rowid:8(int!null)
├── stats: [rows=61.6666667, distinct(4)=8.14917966, null(4)=0, distinct(8)=61.6666667, null(8)=0]
├── stats: [rows=61.6666667, distinct(4)=11.4830985, null(4)=0, distinct(8)=61.6666667, null(8)=0]
├── key: (8)
├── fd: (8)-->(4)
├── scan hist@idx_d
│ ├── columns: d:4(float!null) rowid:8(int!null)
│ ├── constraint: /4/8: [/5.0 - /14.999999999999998] [/40.0 - ]
│ ├── stats: [rows=185, distinct(4)=8.15, null(4)=0, distinct(8)=185, null(8)=0]
│ ├── stats: [rows=185, distinct(4)=11.5, null(4)=0, distinct(8)=185, null(8)=0]
│ │ histogram(4)= 0 0 45 10 90 0 0 40
│ │ <--- 4.999999999999999 ---- 10.0 ---- 14.999999999999998 --- 40.0
│ ├── key: (8)
Expand All @@ -812,13 +812,13 @@ SELECT * FROM hist WHERE e < '2018-07-31 23:00:00'::TIMESTAMP
----
index-join hist
├── columns: a:1(int) b:2(date) c:3(decimal) d:4(float) e:5(timestamp!null) f:6(timestamptz) g:7(string)
├── stats: [rows=105.564516, distinct(5)=21.811828, null(5)=0]
├── stats: [rows=105.564516, distinct(5)=51.5147849, null(5)=0]
│ histogram(5)= 0 0 90 10 5.5645 0
│ <--- '2018-06-30 00:00:00+00:00' ---- '2018-07-31 00:00:00+00:00' -------- '2018-07-31 22:59:59.999999+00:00'
└── scan hist@idx_e
├── columns: e:5(timestamp!null) rowid:8(int!null)
├── constraint: /5/8: (/NULL - /'2018-07-31 22:59:59.999999+00:00']
├── stats: [rows=105.564516, distinct(5)=21.811828, null(5)=0, distinct(8)=105.564516, null(8)=0]
├── stats: [rows=105.564516, distinct(5)=51.5147849, null(5)=0, distinct(8)=105.564516, null(8)=0]
│ histogram(5)= 0 0 90 10 5.5645 0
│ <--- '2018-06-30 00:00:00+00:00' ---- '2018-07-31 00:00:00+00:00' -------- '2018-07-31 22:59:59.999999+00:00'
├── key: (8)
Expand All @@ -830,15 +830,13 @@ SELECT * FROM hist WHERE f = '2019-10-30 23:00:00'::TIMESTAMPTZ
index-join hist
├── columns: a:1(int) b:2(date) c:3(decimal) d:4(float) e:5(timestamp) f:6(timestamptz!null) g:7(string)
├── stats: [rows=1, distinct(6)=1, null(6)=0]
│ histogram(6)= 0 0
│ <--- '2018-10-31 00:00:00+00:00'
│ histogram(6)=
├── fd: ()-->(6)
└── scan hist@idx_f
├── columns: f:6(timestamptz!null) rowid:8(int!null)
├── constraint: /6/8: [/'2019-10-30 23:00:00+00:00' - /'2019-10-30 23:00:00+00:00']
├── stats: [rows=1, distinct(6)=1, null(6)=0, distinct(8)=1, null(8)=0]
│ histogram(6)= 0 0
│ <--- '2018-10-31 00:00:00+00:00'
│ histogram(6)=
├── key: (8)
└── fd: ()-->(6)

Expand Down
Loading

0 comments on commit 05b9ca0

Please sign in to comment.