Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
43547: opt: add exploration rules for min/max => limit 1 transform r=andy-kimball a=andy-kimball

Add new rules that work similarly to the existing ReplaceMaxWithLimit and
ReplaceMinWithLimit rules, except that they recognize the non-Scalar GroupBy
case. The new rules add an ordered LIMIT 1 to the input when it can be proven
that there is at most one group, and there exists a single min/max aggregate
function operating on that group. In that case, the GROUP BY can be entirely
replaced by a PROJECT, similar to this:

  SELECT min(k) FROM kw WHERE w = 5 GROUP BY w
  =>
  SELECT k FROM kw@w WHERE w = 5 ORDER BY k LIMIT 1

Note that the Max and Min versions of this rule are not fully symmetric.
This is because NULL values sort first in CRDB (in ascending order), so an
ascending LIMIT 1 can return a NULL row even when the group also contains
non-NULL values. This interferes with the calculation of the Min function,
because NULL values need to be ignored unless the group contains only NULL
values (in which case the function returns NULL). Therefore, the Min version
of the rule only applies when the MIN column is NOT NULL, as only in that case
is one input row always sufficient to calculate MIN.

Release note (sql change): Add optimization to scan over only 1 row when
finding the MIN/MAX of a single aggregate group, as long as the correct
index is present.

Co-authored-by: Andrew Kimball <andyk@cockroachlabs.com>
  • Loading branch information
craig[bot] and andy-kimball committed Jan 19, 2020
2 parents 4e2c406 + 6c57f77 commit 3bb1834
Show file tree
Hide file tree
Showing 21 changed files with 806 additions and 182 deletions.
68 changes: 62 additions & 6 deletions pkg/sql/logictest/testdata/logic_test/aggregate
Original file line number Diff line number Diff line change
Expand Up @@ -798,6 +798,15 @@ SELECT stddev(x) FROM xyz WHERE x = 1
----
NULL

# Ensure subqueries don't trigger aggregation.
query B
SELECT x > (SELECT avg(0)) FROM xyz LIMIT 1
----
true

statement ok
DROP TABLE xyz

# Numerical stability test for VARIANCE/STDDEV.
# See https://www.johndcook.com/blog/2008/09/28/theoretical-explanation-for-numerical-results.
# Avoid using random() since we do not have the deterministic option to specify a pseudo-random seed yet.
Expand Down Expand Up @@ -854,12 +863,6 @@ SELECT stddev(1::int), stddev(1::float), stddev(1::decimal)
----
NULL NULL NULL

# Ensure subqueries don't trigger aggregation.
query B
SELECT x > (SELECT avg(0)) FROM xyz LIMIT 1
----
true

statement ok
CREATE TABLE bits (b INT)

Expand Down Expand Up @@ -1978,3 +1981,56 @@ SELECT a, b, count(*) FROM ab RIGHT JOIN tab ON b=col2 GROUP BY a
----
NULL NULL 4
4 7 2

# Additional tests for MIN/MAX aggregates with indexes.
statement ok
CREATE TABLE xyz (
x INT PRIMARY KEY,
y INT,
z INT,
INDEX yz (y, z)
)

statement ok
INSERT INTO xyz VALUES (1, 2, 3), (2, 2, 7), (3, 2, 1), (4, 2, NULL), (5, 3, -1)

query I
SELECT min(z) FROM xyz WHERE y = 2 GROUP BY y
----
1

query I
SELECT min(z) FROM xyz WHERE y = 2 AND z IS NOT NULL GROUP BY y
----
1

query I
SELECT min(z) FROM xyz WHERE y = 2 AND z IS NULL GROUP BY y
----
NULL

query I
SELECT min(z) FROM xyz WHERE y = 100 AND z IS NULL GROUP BY y
----

query I
SELECT max(z) FROM xyz WHERE y = 2 GROUP BY y
----
7

query I
SELECT max(z) FROM xyz WHERE y = 2 AND z IS NOT NULL GROUP BY y
----
7

query I
SELECT max(z) FROM xyz WHERE y = 2 AND z IS NULL GROUP BY y
----
NULL

query I
SELECT max(z) FROM xyz WHERE y = 100 GROUP BY y
----

statement ok
DROP TABLE xyz
2 changes: 1 addition & 1 deletion pkg/sql/opt/exec/execbuilder/testdata/distsql_agg
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ group-by
├── grouping columns: b:2
├── internal-ordering: +2 opt(1)
├── stats: [rows=9.5617925, distinct(2)=9.5617925, null(2)=0]
├── cost: 11.1156179
├── cost: 11.1256179
├── key: (2)
├── fd: (2)-->(3)
├── prune: (3)
Expand Down
12 changes: 10 additions & 2 deletions pkg/sql/opt/memo/statistics_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -1478,8 +1478,16 @@ func (sb *statisticsBuilder) buildGroupBy(groupNode RelExpr, relProps *props.Rel
groupingColSet := groupNode.Private().(*GroupingPrivate).GroupingCols

if groupingColSet.Empty() {
// ScalarGroupBy or GroupBy with empty grouping columns.
s.RowCount = 1
if groupNode.Op() == opt.ScalarGroupByOp {
// ScalarGroupBy always returns exactly one row.
s.RowCount = 1
} else {
// GroupBy with empty grouping columns returns 0 or 1 rows, depending
// on whether input has rows. If input has < 1 row, use that, as that
// represents the probability of having 0 vs. 1 rows.
inputStats := sb.statsFromChild(groupNode, 0 /* childIdx */)
s.RowCount = min(1, inputStats.RowCount)
}
} else {
// Estimate the row count based on the distinct count of the grouping
// columns.
Expand Down
18 changes: 9 additions & 9 deletions pkg/sql/opt/memo/testdata/format
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,23 @@ SELECT a + 1, min(b) FROM t WHERE k + a > b GROUP BY a ORDER BY a
sort
├── columns: "?column?":5(int) min:4(int) [hidden: t.public.t.a:1(int)]
├── stats: [rows=98.1771622]
├── cost: 1097.86224
├── cost: 1097.87224
├── key: (1)
├── fd: (1)-->(4,5)
├── ordering: +1
├── prune: (1,4,5)
└── project
├── columns: "?column?":5(int) t.public.t.a:1(int) min:4(int)
├── stats: [rows=98.1771622]
├── cost: 1082.89531
├── cost: 1082.90531
├── key: (1)
├── fd: (1)-->(4,5)
├── prune: (1,4,5)
├── group-by
│ ├── columns: t.public.t.a:1(int) min:4(int)
│ ├── grouping columns: t.public.t.a:1(int)
│ ├── stats: [rows=98.1771622, distinct(1)=98.1771622, null(1)=3.3]
│ ├── cost: 1080.92177
│ ├── cost: 1080.93177
│ ├── key: (1)
│ ├── fd: (1)-->(4)
│ ├── prune: (4)
Expand Down Expand Up @@ -63,17 +63,17 @@ SELECT a + 1, min(b) FROM t WHERE k + a > b GROUP BY a ORDER BY a
sort
├── columns: "?column?":5(int) min:4(int) [hidden: t.public.t.a:1(int)]
├── stats: [rows=98.1771622]
├── cost: 1097.86224
├── cost: 1097.87224
├── ordering: +1
└── project
├── columns: "?column?":5(int) t.public.t.a:1(int) min:4(int)
├── stats: [rows=98.1771622]
├── cost: 1082.89531
├── cost: 1082.90531
├── group-by
│ ├── columns: t.public.t.a:1(int) min:4(int)
│ ├── grouping columns: t.public.t.a:1(int)
│ ├── stats: [rows=98.1771622, distinct(1)=98.1771622, null(1)=3.3]
│ ├── cost: 1080.92177
│ ├── cost: 1080.93177
│ ├── select
│ │ ├── columns: t.public.t.a:1(int) t.public.t.b:2(int!null) t.public.t.k:3(int!null)
│ │ ├── stats: [rows=330, distinct(1)=98.1771622, null(1)=3.3, distinct(2)=100, null(2)=0]
Expand Down Expand Up @@ -179,19 +179,19 @@ SELECT a + 1, min(b) FROM t WHERE k + a > b GROUP BY a ORDER BY a
----
sort
├── stats: [rows=98.1771622]
├── cost: 1097.86224
├── cost: 1097.87224
├── key: (1)
├── fd: (1)-->(4,5)
├── prune: (1,4,5)
└── project
├── stats: [rows=98.1771622]
├── cost: 1082.89531
├── cost: 1082.90531
├── key: (1)
├── fd: (1)-->(4,5)
├── prune: (1,4,5)
├── group-by
│ ├── stats: [rows=98.1771622, distinct(1)=98.1771622, null(1)=3.3]
│ ├── cost: 1080.92177
│ ├── cost: 1080.93177
│ ├── key: (1)
│ ├── fd: (1)-->(4)
│ ├── prune: (4)
Expand Down
12 changes: 6 additions & 6 deletions pkg/sql/opt/memo/testdata/memo
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ memo (optimized, ~3KB, required=[presentation: array_agg:3])
├── G1: (scalar-group-by G2 G3 cols=())
│ └── [presentation: array_agg:3]
│ ├── best: (scalar-group-by G2 G3 cols=())
│ └── cost: 1040.04
│ └── cost: 1040.05
├── G2: (scan a,cols=(1))
│ └── []
│ ├── best: (scan a,cols=(1))
Expand All @@ -309,11 +309,11 @@ memo (optimized, ~3KB, required=[presentation: array_agg:3])
├── G1: (project G2 G3 array_agg)
│ └── [presentation: array_agg:3]
│ ├── best: (project G2 G3 array_agg)
│ └── cost: 1072.04
│ └── cost: 1072.05
├── G2: (group-by G4 G5 cols=(2))
│ └── []
│ ├── best: (group-by G4 G5 cols=(2))
│ └── cost: 1071.03
│ └── cost: 1071.04
├── G3: (projections)
├── G4: (scan a)
│ └── []
Expand All @@ -330,7 +330,7 @@ memo (optimized, ~2KB, required=[presentation: array_agg:3])
├── G1: (scalar-group-by G2 G3 cols=(),ordering=+2)
│ └── [presentation: array_agg:3]
│ ├── best: (scalar-group-by G2="[ordering: +2]" G3 cols=(),ordering=+2)
│ └── cost: 1269.37
│ └── cost: 1269.38
├── G2: (scan a)
│ ├── [ordering: +2]
│ │ ├── best: (sort G2)
Expand All @@ -349,7 +349,7 @@ memo (optimized, ~9KB, required=[presentation: field:6])
├── G1: (distinct-on G2 G3 cols=(6))
│ └── [presentation: field:6]
│ ├── best: (distinct-on G2 G3 cols=(6))
│ └── cost: 0.46
│ └── cost: 0.47
├── G2: (project G4 G5)
│ └── []
│ ├── best: (project G4 G5)
Expand Down Expand Up @@ -377,7 +377,7 @@ memo (optimized, ~6KB, required=[presentation: tag:11])
├── G1: (distinct-on G2 G3 cols=(11))
│ └── [presentation: tag:11]
│ ├── best: (distinct-on G2 G3 cols=(11))
│ └── cost: 0.44
│ └── cost: 0.45
├── G2: (project G4 G5)
│ └── []
│ ├── best: (project G4 G5)
Expand Down
34 changes: 17 additions & 17 deletions pkg/sql/opt/memo/testdata/stats_quality/tpcc
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,7 @@ scalar-group-by
├── columns: sum:11(decimal)
├── cardinality: [1 - 1]
├── stats: [rows=1, distinct(11)=1, null(11)=0]
├── cost: 11.4924782
├── cost: 11.5024782
├── key: ()
├── fd: ()-->(11)
├── prune: (11)
Expand Down Expand Up @@ -587,7 +587,7 @@ scalar-group-by
├── columns: count:28(int)
├── cardinality: [1 - 1]
├── stats: [rows=1, distinct(28)=1, null(28)=0]
├── cost: 1366.93732
├── cost: 1366.94732
├── key: ()
├── fd: ()-->(28)
├── prune: (28)
Expand Down Expand Up @@ -688,7 +688,7 @@ scalar-group-by
├── columns: count:22(int)
├── cardinality: [1 - 1]
├── stats: [rows=1, distinct(22)=1, null(22)=0]
├── cost: 126.623333
├── cost: 126.643333
├── key: ()
├── fd: ()-->(22)
├── prune: (22)
Expand All @@ -698,7 +698,7 @@ scalar-group-by
│ ├── left ordering: +1
│ ├── right ordering: +11
│ ├── stats: [rows=3.33333333, distinct(1)=3.33333333, null(1)=0, distinct(9)=1, null(9)=0, distinct(11)=3.33333333, null(11)=0, distinct(21)=3.33333333, null(21)=0]
│ ├── cost: 126.57
│ ├── cost: 126.58
│ ├── key: (11)
│ ├── fd: (1)-->(9), (11)-->(21), (1)==(11), (11)==(1)
│ ├── scan warehouse
Expand All @@ -718,7 +718,7 @@ scalar-group-by
│ │ ├── columns: d_w_id:11(int!null) sum:21(decimal)
│ │ ├── grouping columns: d_w_id:11(int!null)
│ │ ├── stats: [rows=10, distinct(11)=10, null(11)=0, distinct(21)=10, null(21)=0]
│ │ ├── cost: 115.13
│ │ ├── cost: 115.14
│ │ ├── key: (11)
│ │ ├── fd: (11)-->(21)
│ │ ├── ordering: +11
Expand Down Expand Up @@ -805,7 +805,7 @@ group-by
├── columns: max:4(int) [hidden: no_d_id:2(int!null) no_w_id:3(int!null)]
├── grouping columns: no_d_id:2(int!null) no_w_id:3(int!null)
├── stats: [rows=100, distinct(2)=10, null(2)=0, distinct(3)=10, null(3)=0, distinct(4)=100, null(4)=0, distinct(2,3)=100, null(2,3)=0]
├── cost: 98101.03
├── cost: 98101.04
├── key: (2,3)
├── fd: (2,3)-->(4)
├── ordering: +3,+2
Expand Down Expand Up @@ -849,7 +849,7 @@ group-by
├── columns: max:9(int) [hidden: o_d_id:2(int!null) o_w_id:3(int!null)]
├── grouping columns: o_d_id:2(int!null) o_w_id:3(int!null)
├── stats: [rows=100, distinct(2)=10, null(2)=0, distinct(3)=10, null(3)=0, distinct(9)=100, null(9)=0, distinct(2,3)=100, null(2,3)=0]
├── cost: 336001.03
├── cost: 336001.04
├── key: (2,3)
├── fd: (2,3)-->(9)
├── ordering: +3,+2
Expand Down Expand Up @@ -897,15 +897,15 @@ scalar-group-by
├── columns: count:8(int)
├── cardinality: [1 - 1]
├── stats: [rows=1, distinct(8)=1, null(8)=0]
├── cost: 99902.3933
├── cost: 99902.4133
├── key: ()
├── fd: ()-->(8)
├── prune: (8)
├── select
│ ├── save-table-name: consistency_05_select_2
│ ├── columns: no_d_id:2(int!null) no_w_id:3(int!null) max:4(int) min:5(int) count_rows:6(int)
│ ├── stats: [rows=33.3333333, distinct(2)=9.8265847, null(2)=0, distinct(3)=9.8265847, null(3)=0, distinct(4)=33.3333333, null(4)=0, distinct(5)=33.3333333, null(5)=0, distinct(6)=33.3333333, null(6)=0]
│ ├── cost: 99902.04
│ ├── cost: 99902.05
│ ├── key: (2,3)
│ ├── fd: (2,3)-->(4-6)
│ ├── interesting orderings: (+3,+2)
Expand All @@ -915,7 +915,7 @@ scalar-group-by
│ │ ├── grouping columns: no_d_id:2(int!null) no_w_id:3(int!null)
│ │ ├── internal-ordering: +3,+2
│ │ ├── stats: [rows=100, distinct(2)=10, null(2)=0, distinct(3)=10, null(3)=0, distinct(4)=100, null(4)=0, distinct(5)=100, null(5)=0, distinct(6)=100, null(6)=0, distinct(2,3)=100, null(2,3)=0]
│ │ ├── cost: 99901.03
│ │ ├── cost: 99901.04
│ │ ├── key: (2,3)
│ │ ├── fd: (2,3)-->(4-6)
│ │ ├── prune: (4-6)
Expand Down Expand Up @@ -999,7 +999,7 @@ group-by
├── columns: sum:9(decimal) [hidden: o_d_id:2(int!null) o_w_id:3(int!null)]
├── grouping columns: o_d_id:2(int!null) o_w_id:3(int!null)
├── stats: [rows=100, distinct(2)=10, null(2)=0, distinct(3)=10, null(3)=0, distinct(9)=100, null(9)=0, distinct(2,3)=100, null(2,3)=0]
├── cost: 342001.03
├── cost: 342001.04
├── key: (2,3)
├── fd: (2,3)-->(9)
├── ordering: +3,+2
Expand Down Expand Up @@ -1041,7 +1041,7 @@ sort
├── save-table-name: consistency_07_sort_1
├── columns: count:11(int) [hidden: ol_d_id:2(int!null) ol_w_id:3(int!null)]
├── stats: [rows=100, distinct(2)=10, null(2)=0, distinct(3)=10, null(3)=0, distinct(11)=100, null(11)=0, distinct(2,3)=100, null(2,3)=0]
├── cost: 3361385.73
├── cost: 3361385.74
├── key: (2,3)
├── fd: (2,3)-->(11)
├── ordering: +3,+2
Expand All @@ -1052,7 +1052,7 @@ sort
├── columns: ol_d_id:2(int!null) ol_w_id:3(int!null) count_rows:11(int)
├── grouping columns: ol_d_id:2(int!null) ol_w_id:3(int!null)
├── stats: [rows=100, distinct(2)=10, null(2)=0, distinct(3)=10, null(3)=0, distinct(11)=100, null(11)=0, distinct(2,3)=100, null(2,3)=0]
├── cost: 3361369.67
├── cost: 3361369.68
├── key: (2,3)
├── fd: (2,3)-->(11)
├── prune: (11)
Expand Down Expand Up @@ -1249,7 +1249,7 @@ except-all
├── left columns: o_w_id:3(int!null) o_d_id:2(int!null) o_id:1(int!null) o_ol_cnt:7(int)
├── right columns: ol_w_id:11(int) ol_d_id:10(int) ol_o_id:9(int) count_rows:19(int)
├── stats: [rows=300000, distinct(1)=2999, null(1)=0, distinct(2)=10, null(2)=0, distinct(3)=10, null(3)=0, distinct(7)=11, null(7)=0]
├── cost: 3769391.14
├── cost: 3769391.15
├── scan "order"
│ ├── save-table-name: consistency_10_scan_2
│ ├── columns: o_id:1(int!null) o_d_id:2(int!null) o_w_id:3(int!null) o_ol_cnt:7(int)
Expand All @@ -1266,7 +1266,7 @@ except-all
├── columns: ol_o_id:9(int!null) ol_d_id:10(int!null) ol_w_id:11(int!null) count_rows:19(int)
├── grouping columns: ol_o_id:9(int!null) ol_d_id:10(int!null) ol_w_id:11(int!null)
├── stats: [rows=299900, distinct(9)=2999, null(9)=0, distinct(10)=10, null(10)=0, distinct(11)=10, null(11)=0, distinct(19)=299900, null(19)=0, distinct(9-11)=299900, null(9-11)=0]
├── cost: 3424392.11
├── cost: 3424392.12
├── key: (9-11)
├── fd: (9-11)-->(19)
├── prune: (19)
Expand Down Expand Up @@ -1331,13 +1331,13 @@ except-all
├── left columns: ol_w_id:3(int!null) ol_d_id:2(int!null) ol_o_id:1(int!null) count_rows:11(int)
├── right columns: o_w_id:14(int) o_d_id:13(int) o_id:12(int) o_ol_cnt:18(int)
├── stats: [rows=299900, distinct(1)=2999, null(1)=0, distinct(2)=10, null(2)=0, distinct(3)=10, null(3)=0, distinct(11)=299900, null(11)=0]
├── cost: 3769390.14
├── cost: 3769390.15
├── group-by
│ ├── save-table-name: consistency_11_group_by_2
│ ├── columns: ol_o_id:1(int!null) ol_d_id:2(int!null) ol_w_id:3(int!null) count_rows:11(int)
│ ├── grouping columns: ol_o_id:1(int!null) ol_d_id:2(int!null) ol_w_id:3(int!null)
│ ├── stats: [rows=299900, distinct(1)=2999, null(1)=0, distinct(2)=10, null(2)=0, distinct(3)=10, null(3)=0, distinct(11)=299900, null(11)=0, distinct(1-3)=299900, null(1-3)=0]
│ ├── cost: 3424392.11
│ ├── cost: 3424392.12
│ ├── key: (1-3)
│ ├── fd: (1-3)-->(11)
│ ├── prune: (11)
Expand Down
Loading

0 comments on commit 3bb1834

Please sign in to comment.