Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

stats,opt: performance improvements for histograms #39178

Merged
2 commits merged into the base branch on Aug 1, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions pkg/sql/distsqlrun/sample_aggregator.go
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ func (s *sampleAggregator) writeResults(ctx context.Context) error {
// closure.
if err := s.flowCtx.ClientDB.Txn(ctx, func(ctx context.Context, txn *client.Txn) error {
for _, si := range s.sketches {
distinctCount := int64(si.sketch.Estimate())
var histogram *stats.HistogramData
if si.spec.GenerateHistogram && len(s.sr.Get()) != 0 {
colIdx := int(si.spec.Columns[0])
Expand All @@ -305,6 +306,7 @@ func (s *sampleAggregator) writeResults(ctx context.Context) error {
colIdx,
typ,
si.numRows,
distinctCount,
int(si.spec.HistogramMaxBuckets),
)
if err != nil {
Expand Down Expand Up @@ -338,7 +340,7 @@ func (s *sampleAggregator) writeResults(ctx context.Context) error {
si.spec.StatName,
columnIDs,
si.numRows,
int64(si.sketch.Estimate()),
distinctCount,
si.numNulls,
histogram,
); err != nil {
Expand All @@ -364,6 +366,7 @@ func generateHistogram(
colIdx int,
colType *types.T,
numRows int64,
distinctCount int64,
maxBuckets int,
) (stats.HistogramData, error) {
var da sqlbase.DatumAlloc
Expand All @@ -378,5 +381,5 @@ func generateHistogram(
values = append(values, ed.Datum)
}
}
return stats.EquiDepthHistogram(evalCtx, values, numRows, maxBuckets)
return stats.EquiDepthHistogram(evalCtx, values, numRows, distinctCount, maxBuckets)
}
24 changes: 12 additions & 12 deletions pkg/sql/logictest/testdata/logic_test/distsql_stats
Original file line number Diff line number Diff line change
Expand Up @@ -76,20 +76,20 @@ s1 {a} 10000 10 0 true
let $hist_id_1
SELECT histogram_id FROM [SHOW STATISTICS FOR TABLE data] WHERE statistics_name = 's1'

query TII colnames
query TIRI colnames
SHOW HISTOGRAM $hist_id_1
----
upper_bound range_rows equal_rows
1 0 1000
2 0 1000
3 0 1000
4 0 1000
5 0 1000
6 0 1000
7 0 1000
8 0 1000
9 0 1000
10 0 1000
upper_bound range_rows distinct_range_rows equal_rows
1 0 0 1000
2 0 0 1000
3 0 0 1000
4 0 0 1000
5 0 0 1000
6 0 0 1000
7 0 0 1000
8 0 0 1000
9 0 0 1000
10 0 0 1000

statement ok
CREATE STATISTICS "" ON b FROM data
Expand Down
23 changes: 13 additions & 10 deletions pkg/sql/opt/cat/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,19 +182,22 @@ type TableStatistic interface {
Histogram() []HistogramBucket
}

// HistogramBucket contains the data for a single histogram bucket.
// HistogramBucket contains the data for a single histogram bucket. Note
// that NumEq, NumRange, and DistinctRange are floats so the statisticsBuilder
// can apply filters to the histogram.
type HistogramBucket struct {
// NumEq is the estimated number of values equal to UpperBound.
NumEq uint64
NumEq float64

// NumRange is the estimated number of values between the lower bound of the
// bucket and UpperBound (both boundaries are exclusive).
//
// The lower bound is inferred based on the location of this bucket in a
// slice of buckets. If it is the first bucket, the lower bound is the minimum
// possible value for the given data type. Otherwise, the lower bound is equal
// to the upper bound of the previous bucket.
NumRange uint64
// NumRange is the estimated number of values between the upper bound of the
// previous bucket and UpperBound (both boundaries are exclusive).
// The first bucket should always have NumRange=0.
NumRange float64

// DistinctRange is the estimated number of distinct values between the upper
// bound of the previous bucket and UpperBound (both boundaries are
// exclusive).
DistinctRange float64

// UpperBound is the upper bound of the bucket.
UpperBound tree.Datum
Expand Down
16 changes: 3 additions & 13 deletions pkg/sql/opt/memo/statistics_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,7 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
if cols.Len() == 1 && stat.Histogram() != nil {
col, _ := cols.Next(0)
colStat.Histogram = &props.Histogram{}
colStat.Histogram.Init(sb.evalCtx, col, stat.Histogram(), colStat.DistinctCount)
colStat.Histogram.Init(sb.evalCtx, col, stat.Histogram())
}

// Make sure the distinct count is at least 1, for the same reason as
Expand Down Expand Up @@ -554,23 +554,13 @@ func (sb *statisticsBuilder) colStatScan(colSet opt.ColSet, scan *ScanExpr) *pro

inputColStat := sb.colStatTable(scan.Table, colSet)
colStat := sb.copyColStat(colSet, s, inputColStat)
if inputColStat.Histogram != nil {
colStat.Histogram = inputColStat.Histogram.Copy()
}
colStat.Histogram = inputColStat.Histogram

if s.Selectivity != 1 {
tableStats := sb.makeTableStatistics(scan.Table)
colStat.ApplySelectivity(s.Selectivity, tableStats.RowCount)
}

// Cap distinct and null counts at limit, if it exists.
if scan.HardLimit.IsSet() {
if limit := float64(scan.HardLimit.RowCount()); limit < s.RowCount {
colStat.DistinctCount = min(colStat.DistinctCount, limit)
colStat.NullCount = min(colStat.NullCount, limit)
}
}

if colSet.SubsetOf(relProps.NotNullCols) {
colStat.NullCount = 0
}
Expand Down Expand Up @@ -2215,7 +2205,7 @@ func (sb *statisticsBuilder) finalizeFromRowCount(
if colStat.Histogram != nil {
valuesCount := colStat.Histogram.ValuesCount()
if valuesCount > rowCount {
colStat.Histogram.ApplySelectivity(rowCount / valuesCount)
colStat.Histogram = colStat.Histogram.ApplySelectivity(rowCount / valuesCount)
}
}
}
Expand Down
96 changes: 47 additions & 49 deletions pkg/sql/opt/memo/testdata/stats/scan
Original file line number Diff line number Diff line change
Expand Up @@ -609,11 +609,11 @@ ALTER TABLE hist INJECT STATISTICS '[
"distinct_count": 40,
"histo_col_type": "int",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "upper_bound": "0"},
{"num_eq": 10, "num_range": 90, "upper_bound": "10"},
{"num_eq": 20, "num_range": 180, "upper_bound": "20"},
{"num_eq": 30, "num_range": 270, "upper_bound": "30"},
{"num_eq": 40, "num_range": 360, "upper_bound": "40"}
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "0"},
{"num_eq": 10, "num_range": 90, "distinct_range": 9, "upper_bound": "10"},
{"num_eq": 20, "num_range": 180, "distinct_range": 9, "upper_bound": "20"},
{"num_eq": 30, "num_range": 270, "distinct_range": 9, "upper_bound": "30"},
{"num_eq": 40, "num_range": 360, "distinct_range": 9, "upper_bound": "40"}
]
},
{
Expand All @@ -623,11 +623,11 @@ ALTER TABLE hist INJECT STATISTICS '[
"distinct_count": 120,
"histo_col_type": "date",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "upper_bound": "2018-06-30"},
{"num_eq": 10, "num_range": 90, "upper_bound": "2018-07-31"},
{"num_eq": 20, "num_range": 180, "upper_bound": "2018-08-31"},
{"num_eq": 30, "num_range": 270, "upper_bound": "2018-09-30"},
{"num_eq": 40, "num_range": 360, "upper_bound": "2018-10-31"}
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "2018-06-30"},
{"num_eq": 10, "num_range": 90, "distinct_range": 29, "upper_bound": "2018-07-31"},
{"num_eq": 20, "num_range": 180, "distinct_range": 29, "upper_bound": "2018-08-31"},
{"num_eq": 30, "num_range": 270, "distinct_range": 29, "upper_bound": "2018-09-30"},
{"num_eq": 40, "num_range": 360, "distinct_range": 29, "upper_bound": "2018-10-31"}
]
},
{
Expand All @@ -637,11 +637,11 @@ ALTER TABLE hist INJECT STATISTICS '[
"distinct_count": 45,
"histo_col_type": "decimal",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "upper_bound": "0"},
{"num_eq": 10, "num_range": 90, "upper_bound": "10"},
{"num_eq": 20, "num_range": 180, "upper_bound": "20"},
{"num_eq": 30, "num_range": 270, "upper_bound": "30"},
{"num_eq": 40, "num_range": 360, "upper_bound": "40"}
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "0"},
{"num_eq": 10, "num_range": 90, "distinct_range": 9, "upper_bound": "10"},
{"num_eq": 20, "num_range": 180, "distinct_range": 10, "upper_bound": "20"},
{"num_eq": 30, "num_range": 270, "distinct_range": 11, "upper_bound": "30"},
{"num_eq": 40, "num_range": 360, "distinct_range": 11, "upper_bound": "40"}
]
},
{
Expand All @@ -651,11 +651,11 @@ ALTER TABLE hist INJECT STATISTICS '[
"distinct_count": 45,
"histo_col_type": "float",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "upper_bound": "0"},
{"num_eq": 10, "num_range": 90, "upper_bound": "10"},
{"num_eq": 20, "num_range": 180, "upper_bound": "20"},
{"num_eq": 30, "num_range": 270, "upper_bound": "30"},
{"num_eq": 40, "num_range": 360, "upper_bound": "40"}
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "0"},
{"num_eq": 10, "num_range": 90, "distinct_range": 9, "upper_bound": "10"},
{"num_eq": 20, "num_range": 180, "distinct_range": 10, "upper_bound": "20"},
{"num_eq": 30, "num_range": 270, "distinct_range": 11, "upper_bound": "30"},
{"num_eq": 40, "num_range": 360, "distinct_range": 11, "upper_bound": "40"}
]
},
{
Expand All @@ -665,11 +665,11 @@ ALTER TABLE hist INJECT STATISTICS '[
"distinct_count": 200,
"histo_col_type": "timestamp",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "upper_bound": "2018-06-30"},
{"num_eq": 10, "num_range": 90, "upper_bound": "2018-07-31"},
{"num_eq": 20, "num_range": 180, "upper_bound": "2018-08-31"},
{"num_eq": 30, "num_range": 270, "upper_bound": "2018-09-30"},
{"num_eq": 40, "num_range": 360, "upper_bound": "2018-10-31"}
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "2018-06-30"},
{"num_eq": 10, "num_range": 90, "distinct_range": 49, "upper_bound": "2018-07-31"},
{"num_eq": 20, "num_range": 180, "distinct_range": 49, "upper_bound": "2018-08-31"},
{"num_eq": 30, "num_range": 270, "distinct_range": 49, "upper_bound": "2018-09-30"},
{"num_eq": 40, "num_range": 360, "distinct_range": 49, "upper_bound": "2018-10-31"}
]
},
{
Expand All @@ -679,11 +679,11 @@ ALTER TABLE hist INJECT STATISTICS '[
"distinct_count": 200,
"histo_col_type": "timestamptz",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "upper_bound": "2018-06-30"},
{"num_eq": 10, "num_range": 90, "upper_bound": "2018-07-31"},
{"num_eq": 20, "num_range": 180, "upper_bound": "2018-08-31"},
{"num_eq": 30, "num_range": 270, "upper_bound": "2018-09-30"},
{"num_eq": 40, "num_range": 360, "upper_bound": "2018-10-31"}
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "2018-06-30"},
{"num_eq": 10, "num_range": 90, "distinct_range": 49, "upper_bound": "2018-07-31"},
{"num_eq": 20, "num_range": 180, "distinct_range": 49, "upper_bound": "2018-08-31"},
{"num_eq": 30, "num_range": 270, "distinct_range": 49, "upper_bound": "2018-09-30"},
{"num_eq": 40, "num_range": 360, "distinct_range": 49, "upper_bound": "2018-10-31"}
]
},
{
Expand All @@ -693,11 +693,11 @@ ALTER TABLE hist INJECT STATISTICS '[
"distinct_count": 40,
"histo_col_type": "string",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "upper_bound": "apple"},
{"num_eq": 10, "num_range": 90, "upper_bound": "banana"},
{"num_eq": 20, "num_range": 180, "upper_bound": "cherry"},
{"num_eq": 30, "num_range": 270, "upper_bound": "mango"},
{"num_eq": 40, "num_range": 360, "upper_bound": "pineapple"}
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "apple"},
{"num_eq": 10, "num_range": 90, "distinct_range": 9, "upper_bound": "banana"},
{"num_eq": 20, "num_range": 180, "distinct_range": 9, "upper_bound": "cherry"},
{"num_eq": 30, "num_range": 270, "distinct_range": 9, "upper_bound": "mango"},
{"num_eq": 40, "num_range": 360, "distinct_range": 9, "upper_bound": "pineapple"}
]
}
]'
Expand All @@ -709,13 +709,13 @@ SELECT * FROM hist WHERE a < 10
----
index-join hist
├── columns: a:1(int!null) b:2(date) c:3(decimal) d:4(float) e:5(timestamp) f:6(timestamptz) g:7(string)
├── stats: [rows=90, distinct(1)=8.99985057, null(1)=0]
├── stats: [rows=90, distinct(1)=9, null(1)=0]
│ histogram(1)= 0 0 80 10
│ <--- 0 ---- 9
└── scan hist@idx_a
├── columns: a:1(int!null) rowid:8(int!null)
├── constraint: /1/8: (/NULL - /9]
├── stats: [rows=90, distinct(1)=8.99985057, null(1)=0, distinct(8)=90, null(8)=0]
├── stats: [rows=90, distinct(1)=9, null(1)=0, distinct(8)=90, null(8)=0]
│ histogram(1)= 0 0 80 10
│ <--- 0 ---- 9
├── key: (8)
Expand Down Expand Up @@ -743,13 +743,13 @@ SELECT * FROM hist WHERE b > '2018-07-31'::DATE AND b < '2018-08-05'::DATE
----
index-join hist
├── columns: a:1(int) b:2(date!null) c:3(decimal) d:4(float) e:5(timestamp) f:6(timestamptz) g:7(string)
├── stats: [rows=24, distinct(2)=3.95478935, null(2)=0]
├── stats: [rows=24, distinct(2)=3.9, null(2)=0]
│ histogram(2)= 0 0 18 6
│ <--- '2018-07-31' ---- '2018-08-04'
└── scan hist@idx_b
├── columns: b:2(date!null) rowid:8(int!null)
├── constraint: /2/8: [/'2018-08-01' - /'2018-08-04']
├── stats: [rows=24, distinct(2)=3.95478935, null(2)=0, distinct(8)=24, null(8)=0]
├── stats: [rows=24, distinct(2)=3.9, null(2)=0, distinct(8)=24, null(8)=0]
│ histogram(2)= 0 0 18 6
│ <--- '2018-07-31' ---- '2018-08-04'
├── key: (8)
Expand All @@ -764,7 +764,7 @@ index-join hist
└── scan hist@idx_c
├── columns: c:3(decimal!null) rowid:8(int!null)
├── constraint: /3/8: (/NULL - /10) [/20 - /20]
├── stats: [rows=110, distinct(3)=5.1, null(3)=0, distinct(8)=110, null(8)=0]
├── stats: [rows=110, distinct(3)=10, null(3)=0, distinct(8)=110, null(8)=0]
│ histogram(3)= 0 0 90 0 0 20
│ <--- 0 ---- 10 --- 20
├── key: (8)
Expand All @@ -779,7 +779,7 @@ index-join hist
└── scan hist@idx_c
├── columns: c:3(decimal!null) rowid:8(int!null)
├── constraint: /3/8: (/NULL - /10] [/20 - /20]
├── stats: [rows=120, distinct(3)=6.1, null(3)=0, distinct(8)=120, null(8)=0]
├── stats: [rows=120, distinct(3)=11, null(3)=0, distinct(8)=120, null(8)=0]
│ histogram(3)= 0 0 90 10 0 20
│ <--- 0 ---- 10 --- 20
├── key: (8)
Expand All @@ -793,13 +793,13 @@ index-join hist
├── stats: [rows=333.333333]
└── select
├── columns: d:4(float!null) rowid:8(int!null)
├── stats: [rows=61.6666667, distinct(4)=8.14917966, null(4)=0, distinct(8)=61.6666667, null(8)=0]
├── stats: [rows=61.6666667, distinct(4)=11.4830985, null(4)=0, distinct(8)=61.6666667, null(8)=0]
├── key: (8)
├── fd: (8)-->(4)
├── scan hist@idx_d
│ ├── columns: d:4(float!null) rowid:8(int!null)
│ ├── constraint: /4/8: [/5.0 - /14.999999999999998] [/40.0 - ]
│ ├── stats: [rows=185, distinct(4)=8.15, null(4)=0, distinct(8)=185, null(8)=0]
│ ├── stats: [rows=185, distinct(4)=11.5, null(4)=0, distinct(8)=185, null(8)=0]
│ │ histogram(4)= 0 0 45 10 90 0 0 40
│ │ <--- 4.999999999999999 ---- 10.0 ---- 14.999999999999998 --- 40.0
│ ├── key: (8)
Expand All @@ -812,13 +812,13 @@ SELECT * FROM hist WHERE e < '2018-07-31 23:00:00'::TIMESTAMP
----
index-join hist
├── columns: a:1(int) b:2(date) c:3(decimal) d:4(float) e:5(timestamp!null) f:6(timestamptz) g:7(string)
├── stats: [rows=105.564516, distinct(5)=21.811828, null(5)=0]
├── stats: [rows=105.564516, distinct(5)=51.5147849, null(5)=0]
│ histogram(5)= 0 0 90 10 5.5645 0
│ <--- '2018-06-30 00:00:00+00:00' ---- '2018-07-31 00:00:00+00:00' -------- '2018-07-31 22:59:59.999999+00:00'
└── scan hist@idx_e
├── columns: e:5(timestamp!null) rowid:8(int!null)
├── constraint: /5/8: (/NULL - /'2018-07-31 22:59:59.999999+00:00']
├── stats: [rows=105.564516, distinct(5)=21.811828, null(5)=0, distinct(8)=105.564516, null(8)=0]
├── stats: [rows=105.564516, distinct(5)=51.5147849, null(5)=0, distinct(8)=105.564516, null(8)=0]
│ histogram(5)= 0 0 90 10 5.5645 0
│ <--- '2018-06-30 00:00:00+00:00' ---- '2018-07-31 00:00:00+00:00' -------- '2018-07-31 22:59:59.999999+00:00'
├── key: (8)
Expand All @@ -830,15 +830,13 @@ SELECT * FROM hist WHERE f = '2019-10-30 23:00:00'::TIMESTAMPTZ
index-join hist
├── columns: a:1(int) b:2(date) c:3(decimal) d:4(float) e:5(timestamp) f:6(timestamptz!null) g:7(string)
├── stats: [rows=1, distinct(6)=1, null(6)=0]
│ histogram(6)= 0 0
│ <--- '2018-10-31 00:00:00+00:00'
│ histogram(6)=
├── fd: ()-->(6)
└── scan hist@idx_f
├── columns: f:6(timestamptz!null) rowid:8(int!null)
├── constraint: /6/8: [/'2019-10-30 23:00:00+00:00' - /'2019-10-30 23:00:00+00:00']
├── stats: [rows=1, distinct(6)=1, null(6)=0, distinct(8)=1, null(8)=0]
│ histogram(6)= 0 0
│ <--- '2018-10-31 00:00:00+00:00'
│ histogram(6)=
├── key: (8)
└── fd: ()-->(6)

Expand Down
Loading