opt: fix statistics estimation for semi and anti joins

Prior to this commit, the statisticsBuilder always estimated that the number of output rows for a semi or anti join was equal to the number of rows on the left side, ignoring any ON conditions. This commit improves the estimate by taking the ON conditions into account.

Release note: None
cockroachdb · Aug 28, 2019 · 28d0a38 · 28d0a38
1 parent 8e3d82e
commit 28d0a38
Show file tree

Hide file tree

Showing 12 changed files with 528 additions and 686 deletions.
diff --git a/pkg/sql/opt/memo/statistics_builder.go b/pkg/sql/opt/memo/statistics_builder.go
@@ -828,19 +828,11 @@ func (sb *statisticsBuilder) buildJoin(
 	rightCols := h.rightProps.OutputCols.Copy()
 	equivReps := h.filtersFD.EquivReps()
 
-	// Estimating selectivity for semi-join and anti-join is error-prone.
-	// For now, just propagate stats from the left side.
-	switch h.joinType {
-	case opt.SemiJoinOp, opt.SemiJoinApplyOp, opt.AntiJoinOp, opt.AntiJoinApplyOp:
-		s.RowCount = leftStats.RowCount
-		s.Selectivity = 1
-		return
-	}
-
 	// Shortcut if there are no ON conditions. Note that for lookup join, there
 	// are implicit equality conditions on KeyCols.
 	if h.filterIsTrue {
 		s.RowCount = leftStats.RowCount * rightStats.RowCount
+		s.Selectivity = 1
 		switch h.joinType {
 		case opt.InnerJoinOp, opt.InnerJoinApplyOp:
 		case opt.LeftJoinOp, opt.LeftJoinApplyOp:
@@ -855,13 +847,20 @@ func (sb *statisticsBuilder) buildJoin(
 			// All rows from both sides should be in the result.
 			s.RowCount = max(s.RowCount, leftStats.RowCount)
 			s.RowCount = max(s.RowCount, rightStats.RowCount)
+
+		case opt.SemiJoinOp, opt.SemiJoinApplyOp:
+			s.RowCount = leftStats.RowCount
+
+		case opt.AntiJoinOp, opt.AntiJoinApplyOp:
+			s.RowCount = 0
+			s.Selectivity = 0
 		}
-		s.Selectivity = 1
 		return
 	}
 
 	// Shortcut if the ON condition is false or there is a contradiction.
 	if h.filters.IsFalse() {
+		s.Selectivity = 0
 		switch h.joinType {
 		case opt.InnerJoinOp, opt.InnerJoinApplyOp:
 			s.RowCount = 0
@@ -877,8 +876,14 @@ func (sb *statisticsBuilder) buildJoin(
 		case opt.FullJoinOp:
 			// All rows from both sides should be in the result.
 			s.RowCount = leftStats.RowCount + rightStats.RowCount
+
+		case opt.SemiJoinOp, opt.SemiJoinApplyOp:
+			s.RowCount = 0
+
+		case opt.AntiJoinOp, opt.AntiJoinApplyOp:
+			s.RowCount = leftStats.RowCount
+			s.Selectivity = 1
 		}
-		s.Selectivity = 0
 		return
 	}
 
@@ -900,16 +905,36 @@ func (sb *statisticsBuilder) buildJoin(
 
 	// Calculate selectivity and row count
 	// -----------------------------------
-	if h.rightProps.FuncDeps.ColsAreStrictKey(h.selfJoinCols) {
-		// This is like an index join.
+	s.RowCount = leftStats.RowCount * rightStats.RowCount
+	inputRowCount := s.RowCount
+	switch h.joinType {
+	case opt.SemiJoinOp, opt.SemiJoinApplyOp, opt.AntiJoinOp, opt.AntiJoinApplyOp:
+		// Treat anti join as if it were a semi join for the selectivity
+		// calculations. It will be fixed below.
 		s.RowCount = leftStats.RowCount
-	} else {
-		s.RowCount = leftStats.RowCount * rightStats.RowCount
-		equivReps.UnionWith(h.selfJoinCols)
+		inputRowCount = s.RowCount
+		selectivity := sb.selectivityFromEquivalencies(equivReps, &h.filtersFD, join, s)
+
+		// Multiply the selectivity from equivalencies by the right row count to
+		// account for the fact that semi/anti joins start from the left side row
+		// count rather than the cross product.
+		s.ApplySelectivity(min(rightStats.RowCount*selectivity, 1))
+
+	default:
+		if h.rightProps.FuncDeps.ColsAreStrictKey(h.selfJoinCols) {
+			// This is like an index join, so apply a selectivity that will result
+			// in leftStats.RowCount rows.
+			s.ApplySelectivity(1 / rightStats.RowCount)
+		} else {
+			// Add the self join columns to equivReps so they are included in the
+			// calculation for selectivityFromEquivalencies below.
+			equivReps.UnionWith(h.selfJoinCols)
+		}
+
+		s.ApplySelectivity(sb.selectivityFromEquivalencies(equivReps, &h.filtersFD, join, s))
 	}
-	inputRowCount := s.RowCount
+
 	s.ApplySelectivity(sb.selectivityFromDistinctCounts(constrainedCols, join, s))
-	s.ApplySelectivity(sb.selectivityFromEquivalencies(equivReps, &h.filtersFD, join, s))
 	s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts))
 
 	// Update distinct counts based on equivalencies; this should happen after
@@ -919,16 +944,24 @@ func (sb *statisticsBuilder) buildJoin(
 	// Update null counts for non-nullable columns.
 	sb.updateNullCountsFromProps(join, relProps, inputRowCount)
 
-	s.ApplySelectivity(sb.joinSelectivityFromNullCounts(
-		constrainedCols,
-		join,
-		s,
-		inputRowCount,
-		leftCols,
-		leftStats.RowCount,
-		rightCols,
-		rightStats.RowCount,
-	))
+	switch h.joinType {
+	case opt.SemiJoinOp, opt.SemiJoinApplyOp, opt.AntiJoinOp, opt.AntiJoinApplyOp:
+		// Keep only column stats from the left side.
+		s.ColStats.RemoveIntersecting(h.rightProps.OutputCols)
+		s.ApplySelectivity(sb.selectivityFromNullCounts(constrainedCols, join, s, inputRowCount))
+
+	default:
+		s.ApplySelectivity(sb.joinSelectivityFromNullCounts(
+			constrainedCols,
+			join,
+			s,
+			inputRowCount,
+			leftCols,
+			leftStats.RowCount,
+			rightCols,
+			rightStats.RowCount,
+		))
+	}
 
 	// The above calculation is for inner joins. Other joins need to remove stats
 	// that involve outer columns.
@@ -967,6 +1000,27 @@ func (sb *statisticsBuilder) buildJoin(
 		s.RowCount = leftJoinRowCount + rightJoinRowCount - innerJoinRowCount
 	}
 
+	// Fix the stats for anti join.
+	switch h.joinType {
+	case opt.AntiJoinOp, opt.AntiJoinApplyOp:
+		s.RowCount = max(inputRowCount-s.RowCount, epsilon)
+		s.Selectivity = max(1-s.Selectivity, epsilon)
+		for i := 0; i < s.ColStats.Count(); i++ {
+			colStat := s.ColStats.Get(i)
+			inputColStat := sb.colStatFromChild(colStat.Cols, join, 0 /* childIdx */)
+
+			// Distinct count is tricky for anti-joins. This is a rough estimate,
+			// accounting for the way distinct counts are calculated above.
+			colStat.DistinctCount = max(
+				inputColStat.DistinctCount-colStat.DistinctCount, colStat.DistinctCount,
+			)
+			colStat.NullCount = inputColStat.NullCount - colStat.NullCount
+
+			// TODO(rytaft): Add a method to subtract histograms from each other.
+			colStat.Histogram = nil
+		}
+	}
+
 	// Loop through all colSets added in this step, and adjust null counts and
 	// distinct counts.
 	for i := 0; i < s.ColStats.Count(); i++ {

diff --git a/pkg/sql/opt/memo/testdata/stats/join b/pkg/sql/opt/memo/testdata/stats/join
@@ -316,17 +316,17 @@ SELECT * FROM xysd WHERE EXISTS (SELECT * FROM uv WHERE x=u)
 ----
 semi-join (hash)
  ├── columns: x:1(int!null) y:2(int) s:3(string) d:4(decimal!null)
- ├── stats: [rows=5000]
+ ├── stats: [rows=5000, distinct(1)=500, null(1)=0, distinct(4)=500, null(4)=0]
  ├── key: (1)
  ├── fd: (1)-->(2-4), (3,4)~~>(1,2)
  ├── scan xysd
  │    ├── columns: x:1(int!null) y:2(int) s:3(string) d:4(decimal!null)
- │    ├── stats: [rows=5000]
+ │    ├── stats: [rows=5000, distinct(1)=5000, null(1)=0, distinct(4)=500, null(4)=0]
  │    ├── key: (1)
  │    └── fd: (1)-->(2-4), (3,4)~~>(1,2)
  ├── scan uv
  │    ├── columns: u:5(int)
- │    └── stats: [rows=10000]
+ │    └── stats: [rows=10000, distinct(5)=500, null(5)=0]
  └── filters
       └── x = u [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: (/NULL - ]), fd=(1)==(5), (5)==(1)]
 
@@ -336,17 +336,17 @@ SELECT * FROM xysd WHERE NOT EXISTS (SELECT * FROM uv WHERE x=u)
 ----
 anti-join (hash)
  ├── columns: x:1(int!null) y:2(int) s:3(string) d:4(decimal!null)
- ├── stats: [rows=5000]
+ ├── stats: [rows=1e-10, distinct(1)=1e-10, null(1)=0, distinct(4)=1e-10, null(4)=0]
  ├── key: (1)
  ├── fd: (1)-->(2-4), (3,4)~~>(1,2)
  ├── scan xysd
  │    ├── columns: x:1(int!null) y:2(int) s:3(string) d:4(decimal!null)
- │    ├── stats: [rows=5000]
+ │    ├── stats: [rows=5000, distinct(1)=5000, null(1)=0, distinct(4)=500, null(4)=0]
  │    ├── key: (1)
  │    └── fd: (1)-->(2-4), (3,4)~~>(1,2)
  ├── scan uv
  │    ├── columns: u:5(int)
- │    └── stats: [rows=10000]
+ │    └── stats: [rows=10000, distinct(5)=500, null(5)=0]
  └── filters
       └── x = u [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: (/NULL - ]), fd=(1)==(5), (5)==(1)]
 
@@ -405,21 +405,21 @@ GROUP BY y
 ----
 project
  ├── columns: count:8(int)
- ├── stats: [rows=400]
+ ├── stats: [rows=397.482791]
  └── group-by
       ├── columns: y:2(int) count_rows:8(int)
       ├── grouping columns: y:2(int)
-      ├── stats: [rows=400, distinct(2)=400, null(2)=0]
+      ├── stats: [rows=397.482791, distinct(2)=397.482791, null(2)=0]
       ├── key: (2)
       ├── fd: (2)-->(8)
       ├── semi-join (hash)
       │    ├── columns: x:1(int!null) y:2(int)
-      │    ├── stats: [rows=5000, distinct(2)=400, null(2)=0]
+      │    ├── stats: [rows=1666.66667, distinct(1)=500, null(1)=0, distinct(2)=397.482791, null(2)=0]
       │    ├── key: (1)
       │    ├── fd: (1)-->(2)
       │    ├── scan xysd
       │    │    ├── columns: x:1(int!null) y:2(int)
-      │    │    ├── stats: [rows=5000, distinct(2)=400, null(2)=0]
+      │    │    ├── stats: [rows=5000, distinct(1)=5000, null(1)=0, distinct(2)=400, null(2)=0]
       │    │    ├── key: (1)
       │    │    └── fd: (1)-->(2)
       │    ├── scan uv
@@ -439,21 +439,21 @@ GROUP BY y
 ----
 project
  ├── columns: count:8(int)
- ├── stats: [rows=400]
+ ├── stats: [rows=399.999565]
  └── group-by
       ├── columns: y:2(int) count_rows:8(int)
       ├── grouping columns: y:2(int)
-      ├── stats: [rows=400, distinct(2)=400, null(2)=0]
+      ├── stats: [rows=399.999565, distinct(2)=399.999565, null(2)=0]
       ├── key: (2)
       ├── fd: (2)-->(8)
       ├── anti-join (hash)
       │    ├── columns: x:1(int!null) y:2(int)
-      │    ├── stats: [rows=5000, distinct(2)=400, null(2)=0]
+      │    ├── stats: [rows=3333.33333, distinct(1)=3333.33333, null(1)=0, distinct(2)=399.999565, null(2)=0]
       │    ├── key: (1)
       │    ├── fd: (1)-->(2)
       │    ├── scan xysd
       │    │    ├── columns: x:1(int!null) y:2(int)
-      │    │    ├── stats: [rows=5000, distinct(2)=400, null(2)=0]
+      │    │    ├── stats: [rows=5000, distinct(1)=5000, null(1)=0, distinct(2)=400, null(2)=0]
       │    │    ├── key: (1)
       │    │    └── fd: (1)-->(2)
       │    ├── scan uv
@@ -1257,3 +1257,80 @@ semi-join (lookup def)
  │    ├── prune: (1-3)
  │    └── interesting orderings: (+1,+2)
  └── filters (true)
+
+expr format=show-all colstat=5 colstat=6 colstat=(5, 6) colstat=1 colstat=2 colstat=3 colstat=(1, 2, 3)
+(MakeLookupJoin
+  (Scan [ (Table "abc") (Cols "a,b,c") ])
+  [ (JoinType "anti-join") (Table "def") (Index "def@primary") (KeyCols "a,b") (Cols "a,b,c,d,e,f") ]
+  [ ]
+)
+----
+anti-join (lookup def)
+ ├── columns: t.public.abc.a:1(int!null) t.public.abc.b:2(int!null) t.public.abc.c:3(int)
+ ├── key columns: [1 2] = [4 5]
+ ├── stats: [rows=1e-10, distinct(1)=1e-10, null(1)=0, distinct(2)=1e-10, null(2)=0, distinct(3)=1e-10, null(3)=1e-10, distinct(5)=1e-10, null(5)=0, distinct(6)=1e-10, null(6)=0, distinct(5,6)=1e-10, null(5,6)=0, distinct(1-3)=1e-10, null(1-3)=1e-10]
+ ├── cost: 506.03
+ ├── key: (1,2)
+ ├── fd: (1,2)-->(3)
+ ├── interesting orderings: (+1,+2)
+ ├── scan t.public.abc
+ │    ├── columns: t.public.abc.a:1(int!null) t.public.abc.b:2(int!null) t.public.abc.c:3(int)
+ │    ├── stats: [rows=100, distinct(1)=100, null(1)=0, distinct(2)=10, null(2)=0, distinct(3)=10, null(3)=1, distinct(1-3)=100, null(1-3)=1]
+ │    ├── cost: 106.02
+ │    ├── key: (1,2)
+ │    ├── fd: (1,2)-->(3)
+ │    ├── prune: (1-3)
+ │    └── interesting orderings: (+1,+2)
+ └── filters (true)
+
+expr format=show-all colstat=5 colstat=6 colstat=(5, 6) colstat=1 colstat=2 colstat=3 colstat=(1, 2, 3)
+(MakeLookupJoin
+  (Scan [ (Table "abc") (Cols "a,b,c") ])
+  [ (JoinType "semi-join") (Table "def") (Index "def@primary") (KeyCols "a,b") (Cols "a,b,c,d,e,f") ]
+  [ (False) ]
+)
+----
+semi-join (lookup def)
+ ├── columns: t.public.abc.a:1(int!null) t.public.abc.b:2(int!null) t.public.abc.c:3(int)
+ ├── key columns: [1 2] = [4 5]
+ ├── stats: [rows=0, distinct(1)=0, null(1)=0, distinct(2)=0, null(2)=0, distinct(3)=0, null(3)=0, distinct(5)=0, null(5)=0, distinct(6)=0, null(6)=0, distinct(5,6)=0, null(5,6)=0, distinct(1-3)=0, null(1-3)=0]
+ ├── cost: 713.04
+ ├── key: (1,2)
+ ├── fd: (1,2)-->(3)
+ ├── interesting orderings: (+1,+2)
+ ├── scan t.public.abc
+ │    ├── columns: t.public.abc.a:1(int!null) t.public.abc.b:2(int!null) t.public.abc.c:3(int)
+ │    ├── stats: [rows=100, distinct(1)=100, null(1)=0, distinct(2)=10, null(2)=0, distinct(3)=10, null(3)=1, distinct(1-3)=100, null(1-3)=1]
+ │    ├── cost: 106.02
+ │    ├── key: (1,2)
+ │    ├── fd: (1,2)-->(3)
+ │    ├── prune: (1-3)
+ │    └── interesting orderings: (+1,+2)
+ └── filters
+      └── false [type=bool]
+
+expr format=show-all colstat=5 colstat=6 colstat=(5, 6) colstat=1 colstat=2 colstat=3 colstat=(1, 2, 3)
+(MakeLookupJoin
+  (Scan [ (Table "abc") (Cols "a,b,c") ])
+  [ (JoinType "anti-join") (Table "def") (Index "def@primary") (KeyCols "a,b") (Cols "a,b,c,d,e,f") ]
+  [ (False) ]
+)
+----
+anti-join (lookup def)
+ ├── columns: t.public.abc.a:1(int!null) t.public.abc.b:2(int!null) t.public.abc.c:3(int)
+ ├── key columns: [1 2] = [4 5]
+ ├── stats: [rows=100, distinct(1)=100, null(1)=0, distinct(2)=10, null(2)=0, distinct(3)=10, null(3)=1, distinct(5)=1, null(5)=0, distinct(6)=1, null(6)=0, distinct(5,6)=1, null(5,6)=0, distinct(1-3)=100, null(1-3)=1]
+ ├── cost: 506.04
+ ├── key: (1,2)
+ ├── fd: (1,2)-->(3)
+ ├── interesting orderings: (+1,+2)
+ ├── scan t.public.abc
+ │    ├── columns: t.public.abc.a:1(int!null) t.public.abc.b:2(int!null) t.public.abc.c:3(int)
+ │    ├── stats: [rows=100, distinct(1)=100, null(1)=0, distinct(2)=10, null(2)=0, distinct(3)=10, null(3)=1, distinct(1-3)=100, null(1-3)=1]
+ │    ├── cost: 106.02
+ │    ├── key: (1,2)
+ │    ├── fd: (1,2)-->(3)
+ │    ├── prune: (1-3)
+ │    └── interesting orderings: (+1,+2)
+ └── filters
+      └── false [type=bool]