Skip to content

Commit

Permalink
Merge #39178
Browse files Browse the repository at this point in the history
39178: stats,opt: performance improvements for histograms r=rytaft a=rytaft

**stats,opt: move histogram bucket distinct count estimation code**
    
This commit moves the code to estimate the number of distinct values
per histogram bucket out of the optimizer and into the stats package.
The purpose of doing this is to remove the overhead of calculating the
number of distinct values per bucket from the critical path of query
planning, and instead perform the calculation offline when initially
generating the histogram.

**opt: performance improvements for histograms**
    
This commit improves the performance of histograms in the optimizer
by avoiding allocating and copying the full histogram unless strictly
necessary. Additionally, it changes the code for filtering histograms
to use binary search instead of performing a linear scan.

----

As a result of both of these commits, the overhead of histograms is reduced. Prior to these changes, enabling histograms caused 14% lower throughput and 29% higher latency when running kv with 95% reads. After these changes, histograms cause only 4.6% lower throughput and 11% higher latency. (Clearly there is still work to do, but this is some progress...)

Co-authored-by: Rebecca Taft <becca@cockroachlabs.com>
  • Loading branch information
craig[bot] and rytaft committed Aug 1, 2019
2 parents 9e67a9c + 3243046 commit 05b9ca0
Show file tree
Hide file tree
Showing 20 changed files with 23,274 additions and 16,007 deletions.
7 changes: 5 additions & 2 deletions pkg/sql/distsqlrun/sample_aggregator.go
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ func (s *sampleAggregator) writeResults(ctx context.Context) error {
// closure.
if err := s.flowCtx.ClientDB.Txn(ctx, func(ctx context.Context, txn *client.Txn) error {
for _, si := range s.sketches {
distinctCount := int64(si.sketch.Estimate())
var histogram *stats.HistogramData
if si.spec.GenerateHistogram && len(s.sr.Get()) != 0 {
colIdx := int(si.spec.Columns[0])
Expand All @@ -305,6 +306,7 @@ func (s *sampleAggregator) writeResults(ctx context.Context) error {
colIdx,
typ,
si.numRows,
distinctCount,
int(si.spec.HistogramMaxBuckets),
)
if err != nil {
Expand Down Expand Up @@ -338,7 +340,7 @@ func (s *sampleAggregator) writeResults(ctx context.Context) error {
si.spec.StatName,
columnIDs,
si.numRows,
int64(si.sketch.Estimate()),
distinctCount,
si.numNulls,
histogram,
); err != nil {
Expand All @@ -364,6 +366,7 @@ func generateHistogram(
colIdx int,
colType *types.T,
numRows int64,
distinctCount int64,
maxBuckets int,
) (stats.HistogramData, error) {
var da sqlbase.DatumAlloc
Expand All @@ -378,5 +381,5 @@ func generateHistogram(
values = append(values, ed.Datum)
}
}
return stats.EquiDepthHistogram(evalCtx, values, numRows, maxBuckets)
return stats.EquiDepthHistogram(evalCtx, values, numRows, distinctCount, maxBuckets)
}
24 changes: 12 additions & 12 deletions pkg/sql/logictest/testdata/logic_test/distsql_stats
Original file line number Diff line number Diff line change
Expand Up @@ -76,20 +76,20 @@ s1 {a} 10000 10 0 true
let $hist_id_1
SELECT histogram_id FROM [SHOW STATISTICS FOR TABLE data] WHERE statistics_name = 's1'

query TII colnames
query TIRI colnames
SHOW HISTOGRAM $hist_id_1
----
upper_bound range_rows equal_rows
1 0 1000
2 0 1000
3 0 1000
4 0 1000
5 0 1000
6 0 1000
7 0 1000
8 0 1000
9 0 1000
10 0 1000
upper_bound range_rows distinct_range_rows equal_rows
1 0 0 1000
2 0 0 1000
3 0 0 1000
4 0 0 1000
5 0 0 1000
6 0 0 1000
7 0 0 1000
8 0 0 1000
9 0 0 1000
10 0 0 1000

statement ok
CREATE STATISTICS "" ON b FROM data
Expand Down
23 changes: 13 additions & 10 deletions pkg/sql/opt/cat/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,19 +182,22 @@ type TableStatistic interface {
Histogram() []HistogramBucket
}

// HistogramBucket contains the data for a single histogram bucket.
// HistogramBucket contains the data for a single histogram bucket. Note
// that NumEq, NumRange, and DistinctRange are floats so the statisticsBuilder
// can apply filters to the histogram.
type HistogramBucket struct {
// NumEq is the estimated number of values equal to UpperBound.
NumEq uint64
NumEq float64

// NumRange is the estimated number of values between the lower bound of the
// bucket and UpperBound (both boundaries are exclusive).
//
// The lower bound is inferred based on the location of this bucket in a
// slice of buckets. If it is the first bucket, the lower bound is the minimum
// possible value for the given data type. Otherwise, the lower bound is equal
// to the upper bound of the previous bucket.
NumRange uint64
// NumRange is the estimated number of values between the upper bound of the
// previous bucket and UpperBound (both boundaries are exclusive).
// The first bucket should always have NumRange=0.
NumRange float64

// DistinctRange is the estimated number of distinct values between the upper
// bound of the previous bucket and UpperBound (both boundaries are
// exclusive).
DistinctRange float64

// UpperBound is the upper bound of the bucket.
UpperBound tree.Datum
Expand Down
16 changes: 3 additions & 13 deletions pkg/sql/opt/memo/statistics_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,7 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
if cols.Len() == 1 && stat.Histogram() != nil {
col, _ := cols.Next(0)
colStat.Histogram = &props.Histogram{}
colStat.Histogram.Init(sb.evalCtx, col, stat.Histogram(), colStat.DistinctCount)
colStat.Histogram.Init(sb.evalCtx, col, stat.Histogram())
}

// Make sure the distinct count is at least 1, for the same reason as
Expand Down Expand Up @@ -554,23 +554,13 @@ func (sb *statisticsBuilder) colStatScan(colSet opt.ColSet, scan *ScanExpr) *pro

inputColStat := sb.colStatTable(scan.Table, colSet)
colStat := sb.copyColStat(colSet, s, inputColStat)
if inputColStat.Histogram != nil {
colStat.Histogram = inputColStat.Histogram.Copy()
}
colStat.Histogram = inputColStat.Histogram

if s.Selectivity != 1 {
tableStats := sb.makeTableStatistics(scan.Table)
colStat.ApplySelectivity(s.Selectivity, tableStats.RowCount)
}

// Cap distinct and null counts at limit, if it exists.
if scan.HardLimit.IsSet() {
if limit := float64(scan.HardLimit.RowCount()); limit < s.RowCount {
colStat.DistinctCount = min(colStat.DistinctCount, limit)
colStat.NullCount = min(colStat.NullCount, limit)
}
}

if colSet.SubsetOf(relProps.NotNullCols) {
colStat.NullCount = 0
}
Expand Down Expand Up @@ -2215,7 +2205,7 @@ func (sb *statisticsBuilder) finalizeFromRowCount(
if colStat.Histogram != nil {
valuesCount := colStat.Histogram.ValuesCount()
if valuesCount > rowCount {
colStat.Histogram.ApplySelectivity(rowCount / valuesCount)
colStat.Histogram = colStat.Histogram.ApplySelectivity(rowCount / valuesCount)
}
}
}
Expand Down
96 changes: 47 additions & 49 deletions pkg/sql/opt/memo/testdata/stats/scan
Original file line number Diff line number Diff line change
Expand Up @@ -609,11 +609,11 @@ ALTER TABLE hist INJECT STATISTICS '[
"distinct_count": 40,
"histo_col_type": "int",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "upper_bound": "0"},
{"num_eq": 10, "num_range": 90, "upper_bound": "10"},
{"num_eq": 20, "num_range": 180, "upper_bound": "20"},
{"num_eq": 30, "num_range": 270, "upper_bound": "30"},
{"num_eq": 40, "num_range": 360, "upper_bound": "40"}
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "0"},
{"num_eq": 10, "num_range": 90, "distinct_range": 9, "upper_bound": "10"},
{"num_eq": 20, "num_range": 180, "distinct_range": 9, "upper_bound": "20"},
{"num_eq": 30, "num_range": 270, "distinct_range": 9, "upper_bound": "30"},
{"num_eq": 40, "num_range": 360, "distinct_range": 9, "upper_bound": "40"}
]
},
{
Expand All @@ -623,11 +623,11 @@ ALTER TABLE hist INJECT STATISTICS '[
"distinct_count": 120,
"histo_col_type": "date",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "upper_bound": "2018-06-30"},
{"num_eq": 10, "num_range": 90, "upper_bound": "2018-07-31"},
{"num_eq": 20, "num_range": 180, "upper_bound": "2018-08-31"},
{"num_eq": 30, "num_range": 270, "upper_bound": "2018-09-30"},
{"num_eq": 40, "num_range": 360, "upper_bound": "2018-10-31"}
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "2018-06-30"},
{"num_eq": 10, "num_range": 90, "distinct_range": 29, "upper_bound": "2018-07-31"},
{"num_eq": 20, "num_range": 180, "distinct_range": 29, "upper_bound": "2018-08-31"},
{"num_eq": 30, "num_range": 270, "distinct_range": 29, "upper_bound": "2018-09-30"},
{"num_eq": 40, "num_range": 360, "distinct_range": 29, "upper_bound": "2018-10-31"}
]
},
{
Expand All @@ -637,11 +637,11 @@ ALTER TABLE hist INJECT STATISTICS '[
"distinct_count": 45,
"histo_col_type": "decimal",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "upper_bound": "0"},
{"num_eq": 10, "num_range": 90, "upper_bound": "10"},
{"num_eq": 20, "num_range": 180, "upper_bound": "20"},
{"num_eq": 30, "num_range": 270, "upper_bound": "30"},
{"num_eq": 40, "num_range": 360, "upper_bound": "40"}
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "0"},
{"num_eq": 10, "num_range": 90, "distinct_range": 9, "upper_bound": "10"},
{"num_eq": 20, "num_range": 180, "distinct_range": 10, "upper_bound": "20"},
{"num_eq": 30, "num_range": 270, "distinct_range": 11, "upper_bound": "30"},
{"num_eq": 40, "num_range": 360, "distinct_range": 11, "upper_bound": "40"}
]
},
{
Expand All @@ -651,11 +651,11 @@ ALTER TABLE hist INJECT STATISTICS '[
"distinct_count": 45,
"histo_col_type": "float",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "upper_bound": "0"},
{"num_eq": 10, "num_range": 90, "upper_bound": "10"},
{"num_eq": 20, "num_range": 180, "upper_bound": "20"},
{"num_eq": 30, "num_range": 270, "upper_bound": "30"},
{"num_eq": 40, "num_range": 360, "upper_bound": "40"}
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "0"},
{"num_eq": 10, "num_range": 90, "distinct_range": 9, "upper_bound": "10"},
{"num_eq": 20, "num_range": 180, "distinct_range": 10, "upper_bound": "20"},
{"num_eq": 30, "num_range": 270, "distinct_range": 11, "upper_bound": "30"},
{"num_eq": 40, "num_range": 360, "distinct_range": 11, "upper_bound": "40"}
]
},
{
Expand All @@ -665,11 +665,11 @@ ALTER TABLE hist INJECT STATISTICS '[
"distinct_count": 200,
"histo_col_type": "timestamp",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "upper_bound": "2018-06-30"},
{"num_eq": 10, "num_range": 90, "upper_bound": "2018-07-31"},
{"num_eq": 20, "num_range": 180, "upper_bound": "2018-08-31"},
{"num_eq": 30, "num_range": 270, "upper_bound": "2018-09-30"},
{"num_eq": 40, "num_range": 360, "upper_bound": "2018-10-31"}
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "2018-06-30"},
{"num_eq": 10, "num_range": 90, "distinct_range": 49, "upper_bound": "2018-07-31"},
{"num_eq": 20, "num_range": 180, "distinct_range": 49, "upper_bound": "2018-08-31"},
{"num_eq": 30, "num_range": 270, "distinct_range": 49, "upper_bound": "2018-09-30"},
{"num_eq": 40, "num_range": 360, "distinct_range": 49, "upper_bound": "2018-10-31"}
]
},
{
Expand All @@ -679,11 +679,11 @@ ALTER TABLE hist INJECT STATISTICS '[
"distinct_count": 200,
"histo_col_type": "timestamptz",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "upper_bound": "2018-06-30"},
{"num_eq": 10, "num_range": 90, "upper_bound": "2018-07-31"},
{"num_eq": 20, "num_range": 180, "upper_bound": "2018-08-31"},
{"num_eq": 30, "num_range": 270, "upper_bound": "2018-09-30"},
{"num_eq": 40, "num_range": 360, "upper_bound": "2018-10-31"}
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "2018-06-30"},
{"num_eq": 10, "num_range": 90, "distinct_range": 49, "upper_bound": "2018-07-31"},
{"num_eq": 20, "num_range": 180, "distinct_range": 49, "upper_bound": "2018-08-31"},
{"num_eq": 30, "num_range": 270, "distinct_range": 49, "upper_bound": "2018-09-30"},
{"num_eq": 40, "num_range": 360, "distinct_range": 49, "upper_bound": "2018-10-31"}
]
},
{
Expand All @@ -693,11 +693,11 @@ ALTER TABLE hist INJECT STATISTICS '[
"distinct_count": 40,
"histo_col_type": "string",
"histo_buckets": [
{"num_eq": 0, "num_range": 0, "upper_bound": "apple"},
{"num_eq": 10, "num_range": 90, "upper_bound": "banana"},
{"num_eq": 20, "num_range": 180, "upper_bound": "cherry"},
{"num_eq": 30, "num_range": 270, "upper_bound": "mango"},
{"num_eq": 40, "num_range": 360, "upper_bound": "pineapple"}
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "apple"},
{"num_eq": 10, "num_range": 90, "distinct_range": 9, "upper_bound": "banana"},
{"num_eq": 20, "num_range": 180, "distinct_range": 9, "upper_bound": "cherry"},
{"num_eq": 30, "num_range": 270, "distinct_range": 9, "upper_bound": "mango"},
{"num_eq": 40, "num_range": 360, "distinct_range": 9, "upper_bound": "pineapple"}
]
}
]'
Expand All @@ -709,13 +709,13 @@ SELECT * FROM hist WHERE a < 10
----
index-join hist
├── columns: a:1(int!null) b:2(date) c:3(decimal) d:4(float) e:5(timestamp) f:6(timestamptz) g:7(string)
├── stats: [rows=90, distinct(1)=8.99985057, null(1)=0]
├── stats: [rows=90, distinct(1)=9, null(1)=0]
│ histogram(1)= 0 0 80 10
│ <--- 0 ---- 9
└── scan hist@idx_a
├── columns: a:1(int!null) rowid:8(int!null)
├── constraint: /1/8: (/NULL - /9]
├── stats: [rows=90, distinct(1)=8.99985057, null(1)=0, distinct(8)=90, null(8)=0]
├── stats: [rows=90, distinct(1)=9, null(1)=0, distinct(8)=90, null(8)=0]
│ histogram(1)= 0 0 80 10
│ <--- 0 ---- 9
├── key: (8)
Expand Down Expand Up @@ -743,13 +743,13 @@ SELECT * FROM hist WHERE b > '2018-07-31'::DATE AND b < '2018-08-05'::DATE
----
index-join hist
├── columns: a:1(int) b:2(date!null) c:3(decimal) d:4(float) e:5(timestamp) f:6(timestamptz) g:7(string)
├── stats: [rows=24, distinct(2)=3.95478935, null(2)=0]
├── stats: [rows=24, distinct(2)=3.9, null(2)=0]
│ histogram(2)= 0 0 18 6
│ <--- '2018-07-31' ---- '2018-08-04'
└── scan hist@idx_b
├── columns: b:2(date!null) rowid:8(int!null)
├── constraint: /2/8: [/'2018-08-01' - /'2018-08-04']
├── stats: [rows=24, distinct(2)=3.95478935, null(2)=0, distinct(8)=24, null(8)=0]
├── stats: [rows=24, distinct(2)=3.9, null(2)=0, distinct(8)=24, null(8)=0]
│ histogram(2)= 0 0 18 6
│ <--- '2018-07-31' ---- '2018-08-04'
├── key: (8)
Expand All @@ -764,7 +764,7 @@ index-join hist
└── scan hist@idx_c
├── columns: c:3(decimal!null) rowid:8(int!null)
├── constraint: /3/8: (/NULL - /10) [/20 - /20]
├── stats: [rows=110, distinct(3)=5.1, null(3)=0, distinct(8)=110, null(8)=0]
├── stats: [rows=110, distinct(3)=10, null(3)=0, distinct(8)=110, null(8)=0]
│ histogram(3)= 0 0 90 0 0 20
│ <--- 0 ---- 10 --- 20
├── key: (8)
Expand All @@ -779,7 +779,7 @@ index-join hist
└── scan hist@idx_c
├── columns: c:3(decimal!null) rowid:8(int!null)
├── constraint: /3/8: (/NULL - /10] [/20 - /20]
├── stats: [rows=120, distinct(3)=6.1, null(3)=0, distinct(8)=120, null(8)=0]
├── stats: [rows=120, distinct(3)=11, null(3)=0, distinct(8)=120, null(8)=0]
│ histogram(3)= 0 0 90 10 0 20
│ <--- 0 ---- 10 --- 20
├── key: (8)
Expand All @@ -793,13 +793,13 @@ index-join hist
├── stats: [rows=333.333333]
└── select
├── columns: d:4(float!null) rowid:8(int!null)
├── stats: [rows=61.6666667, distinct(4)=8.14917966, null(4)=0, distinct(8)=61.6666667, null(8)=0]
├── stats: [rows=61.6666667, distinct(4)=11.4830985, null(4)=0, distinct(8)=61.6666667, null(8)=0]
├── key: (8)
├── fd: (8)-->(4)
├── scan hist@idx_d
│ ├── columns: d:4(float!null) rowid:8(int!null)
│ ├── constraint: /4/8: [/5.0 - /14.999999999999998] [/40.0 - ]
│ ├── stats: [rows=185, distinct(4)=8.15, null(4)=0, distinct(8)=185, null(8)=0]
│ ├── stats: [rows=185, distinct(4)=11.5, null(4)=0, distinct(8)=185, null(8)=0]
│ │ histogram(4)= 0 0 45 10 90 0 0 40
│ │ <--- 4.999999999999999 ---- 10.0 ---- 14.999999999999998 --- 40.0
│ ├── key: (8)
Expand All @@ -812,13 +812,13 @@ SELECT * FROM hist WHERE e < '2018-07-31 23:00:00'::TIMESTAMP
----
index-join hist
├── columns: a:1(int) b:2(date) c:3(decimal) d:4(float) e:5(timestamp!null) f:6(timestamptz) g:7(string)
├── stats: [rows=105.564516, distinct(5)=21.811828, null(5)=0]
├── stats: [rows=105.564516, distinct(5)=51.5147849, null(5)=0]
│ histogram(5)= 0 0 90 10 5.5645 0
│ <--- '2018-06-30 00:00:00+00:00' ---- '2018-07-31 00:00:00+00:00' -------- '2018-07-31 22:59:59.999999+00:00'
└── scan hist@idx_e
├── columns: e:5(timestamp!null) rowid:8(int!null)
├── constraint: /5/8: (/NULL - /'2018-07-31 22:59:59.999999+00:00']
├── stats: [rows=105.564516, distinct(5)=21.811828, null(5)=0, distinct(8)=105.564516, null(8)=0]
├── stats: [rows=105.564516, distinct(5)=51.5147849, null(5)=0, distinct(8)=105.564516, null(8)=0]
│ histogram(5)= 0 0 90 10 5.5645 0
│ <--- '2018-06-30 00:00:00+00:00' ---- '2018-07-31 00:00:00+00:00' -------- '2018-07-31 22:59:59.999999+00:00'
├── key: (8)
Expand All @@ -830,15 +830,13 @@ SELECT * FROM hist WHERE f = '2019-10-30 23:00:00'::TIMESTAMPTZ
index-join hist
├── columns: a:1(int) b:2(date) c:3(decimal) d:4(float) e:5(timestamp) f:6(timestamptz!null) g:7(string)
├── stats: [rows=1, distinct(6)=1, null(6)=0]
│ histogram(6)= 0 0
│ <--- '2018-10-31 00:00:00+00:00'
│ histogram(6)=
├── fd: ()-->(6)
└── scan hist@idx_f
├── columns: f:6(timestamptz!null) rowid:8(int!null)
├── constraint: /6/8: [/'2019-10-30 23:00:00+00:00' - /'2019-10-30 23:00:00+00:00']
├── stats: [rows=1, distinct(6)=1, null(6)=0, distinct(8)=1, null(8)=0]
│ histogram(6)= 0 0
│ <--- '2018-10-31 00:00:00+00:00'
│ histogram(6)=
├── key: (8)
└── fd: ()-->(6)

Expand Down
Loading

0 comments on commit 05b9ca0

Please sign in to comment.