Skip to content

Commit

Permalink
opt: fix statistics estimation for semi and anti joins
Browse files (browse the repository at this point in the history)
Prior to this commit, the statisticsBuilder always estimated that
the number of output rows for a semi or anti join was equal to the
number of rows on the left side. It ignored any ON conditions.
This commit improves the estimate by taking into account the ON
conditions.

Release note: None
  • Loading branch information
rytaft committed Aug 28, 2019
1 parent 8e3d82e commit 28d0a38
Show file tree
Hide file tree
Showing 12 changed files with 528 additions and 686 deletions.
110 changes: 82 additions & 28 deletions pkg/sql/opt/memo/statistics_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -828,19 +828,11 @@ func (sb *statisticsBuilder) buildJoin(
rightCols := h.rightProps.OutputCols.Copy()
equivReps := h.filtersFD.EquivReps()

// Estimating selectivity for semi-join and anti-join is error-prone.
// For now, just propagate stats from the left side.
switch h.joinType {
case opt.SemiJoinOp, opt.SemiJoinApplyOp, opt.AntiJoinOp, opt.AntiJoinApplyOp:
s.RowCount = leftStats.RowCount
s.Selectivity = 1
return
}

// Shortcut if there are no ON conditions. Note that for lookup join, there
// are implicit equality conditions on KeyCols.
if h.filterIsTrue {
s.RowCount = leftStats.RowCount * rightStats.RowCount
s.Selectivity = 1
switch h.joinType {
case opt.InnerJoinOp, opt.InnerJoinApplyOp:
case opt.LeftJoinOp, opt.LeftJoinApplyOp:
Expand All @@ -855,13 +847,20 @@ func (sb *statisticsBuilder) buildJoin(
// All rows from both sides should be in the result.
s.RowCount = max(s.RowCount, leftStats.RowCount)
s.RowCount = max(s.RowCount, rightStats.RowCount)

case opt.SemiJoinOp, opt.SemiJoinApplyOp:
s.RowCount = leftStats.RowCount

case opt.AntiJoinOp, opt.AntiJoinApplyOp:
s.RowCount = 0
s.Selectivity = 0
}
s.Selectivity = 1
return
}

// Shortcut if the ON condition is false or there is a contradiction.
if h.filters.IsFalse() {
s.Selectivity = 0
switch h.joinType {
case opt.InnerJoinOp, opt.InnerJoinApplyOp:
s.RowCount = 0
Expand All @@ -877,8 +876,14 @@ func (sb *statisticsBuilder) buildJoin(
case opt.FullJoinOp:
// All rows from both sides should be in the result.
s.RowCount = leftStats.RowCount + rightStats.RowCount

case opt.SemiJoinOp, opt.SemiJoinApplyOp:
s.RowCount = 0

case opt.AntiJoinOp, opt.AntiJoinApplyOp:
s.RowCount = leftStats.RowCount
s.Selectivity = 1
}
s.Selectivity = 0
return
}

Expand All @@ -900,16 +905,36 @@ func (sb *statisticsBuilder) buildJoin(

// Calculate selectivity and row count
// -----------------------------------
if h.rightProps.FuncDeps.ColsAreStrictKey(h.selfJoinCols) {
// This is like an index join.
s.RowCount = leftStats.RowCount * rightStats.RowCount
inputRowCount := s.RowCount
switch h.joinType {
case opt.SemiJoinOp, opt.SemiJoinApplyOp, opt.AntiJoinOp, opt.AntiJoinApplyOp:
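// Treat anti join as if it were a semi join for the selectivity
// calculations. The anti join row count, selectivity, and column stats
// are corrected below, once the semi join estimate has been computed.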
s.RowCount = leftStats.RowCount
} else {
s.RowCount = leftStats.RowCount * rightStats.RowCount
equivReps.UnionWith(h.selfJoinCols)
inputRowCount = s.RowCount
selectivity := sb.selectivityFromEquivalencies(equivReps, &h.filtersFD, join, s)

// Multiply the selectivity from equivalencies by the right row count to
// account for the fact that semi/anti joins start from the left side row
// count rather than the cross product.
s.ApplySelectivity(min(rightStats.RowCount*selectivity, 1))

default:
if h.rightProps.FuncDeps.ColsAreStrictKey(h.selfJoinCols) {
// This is like an index join, so apply a selectivity that will result
// in leftStats.RowCount rows.
s.ApplySelectivity(1 / rightStats.RowCount)
} else {
// Add the self join columns to equivReps so they are included in the
// calculation for selectivityFromEquivalencies below.
equivReps.UnionWith(h.selfJoinCols)
}

s.ApplySelectivity(sb.selectivityFromEquivalencies(equivReps, &h.filtersFD, join, s))
}
inputRowCount := s.RowCount

s.ApplySelectivity(sb.selectivityFromDistinctCounts(constrainedCols, join, s))
s.ApplySelectivity(sb.selectivityFromEquivalencies(equivReps, &h.filtersFD, join, s))
s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts))

// Update distinct counts based on equivalencies; this should happen after
Expand All @@ -919,16 +944,24 @@ func (sb *statisticsBuilder) buildJoin(
// Update null counts for non-nullable columns.
sb.updateNullCountsFromProps(join, relProps, inputRowCount)

s.ApplySelectivity(sb.joinSelectivityFromNullCounts(
constrainedCols,
join,
s,
inputRowCount,
leftCols,
leftStats.RowCount,
rightCols,
rightStats.RowCount,
))
switch h.joinType {
case opt.SemiJoinOp, opt.SemiJoinApplyOp, opt.AntiJoinOp, opt.AntiJoinApplyOp:
// Keep only column stats from the left side.
s.ColStats.RemoveIntersecting(h.rightProps.OutputCols)
s.ApplySelectivity(sb.selectivityFromNullCounts(constrainedCols, join, s, inputRowCount))

default:
s.ApplySelectivity(sb.joinSelectivityFromNullCounts(
constrainedCols,
join,
s,
inputRowCount,
leftCols,
leftStats.RowCount,
rightCols,
rightStats.RowCount,
))
}

// The above calculation is for inner joins. Other joins need to remove stats
// that involve outer columns.
Expand Down Expand Up @@ -967,6 +1000,27 @@ func (sb *statisticsBuilder) buildJoin(
s.RowCount = leftJoinRowCount + rightJoinRowCount - innerJoinRowCount
}

// Fix the stats for anti join.
switch h.joinType {
case opt.AntiJoinOp, opt.AntiJoinApplyOp:
s.RowCount = max(inputRowCount-s.RowCount, epsilon)
s.Selectivity = max(1-s.Selectivity, epsilon)
for i := 0; i < s.ColStats.Count(); i++ {
colStat := s.ColStats.Get(i)
inputColStat := sb.colStatFromChild(colStat.Cols, join, 0 /* childIdx */)

// Distinct count is tricky for anti-joins. This is a rough estimate,
// accounting for the way distinct counts are calculated above.
colStat.DistinctCount = max(
inputColStat.DistinctCount-colStat.DistinctCount, colStat.DistinctCount,
)
colStat.NullCount = inputColStat.NullCount - colStat.NullCount

// TODO(rytaft): Add a method to subtract histograms from each other.
colStat.Histogram = nil
}
}

// Loop through all colSets added in this step, and adjust null counts and
// distinct counts.
for i := 0; i < s.ColStats.Count(); i++ {
Expand Down
105 changes: 91 additions & 14 deletions pkg/sql/opt/memo/testdata/stats/join
Original file line number Diff line number Diff line change
Expand Up @@ -316,17 +316,17 @@ SELECT * FROM xysd WHERE EXISTS (SELECT * FROM uv WHERE x=u)
----
semi-join (hash)
├── columns: x:1(int!null) y:2(int) s:3(string) d:4(decimal!null)
├── stats: [rows=5000]
├── stats: [rows=5000, distinct(1)=500, null(1)=0, distinct(4)=500, null(4)=0]
├── key: (1)
├── fd: (1)-->(2-4), (3,4)~~>(1,2)
├── scan xysd
│ ├── columns: x:1(int!null) y:2(int) s:3(string) d:4(decimal!null)
│ ├── stats: [rows=5000]
│ ├── stats: [rows=5000, distinct(1)=5000, null(1)=0, distinct(4)=500, null(4)=0]
│ ├── key: (1)
│ └── fd: (1)-->(2-4), (3,4)~~>(1,2)
├── scan uv
│ ├── columns: u:5(int)
│ └── stats: [rows=10000]
│ └── stats: [rows=10000, distinct(5)=500, null(5)=0]
└── filters
└── x = u [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: (/NULL - ]), fd=(1)==(5), (5)==(1)]

Expand All @@ -336,17 +336,17 @@ SELECT * FROM xysd WHERE NOT EXISTS (SELECT * FROM uv WHERE x=u)
----
anti-join (hash)
├── columns: x:1(int!null) y:2(int) s:3(string) d:4(decimal!null)
├── stats: [rows=5000]
├── stats: [rows=1e-10, distinct(1)=1e-10, null(1)=0, distinct(4)=1e-10, null(4)=0]
├── key: (1)
├── fd: (1)-->(2-4), (3,4)~~>(1,2)
├── scan xysd
│ ├── columns: x:1(int!null) y:2(int) s:3(string) d:4(decimal!null)
│ ├── stats: [rows=5000]
│ ├── stats: [rows=5000, distinct(1)=5000, null(1)=0, distinct(4)=500, null(4)=0]
│ ├── key: (1)
│ └── fd: (1)-->(2-4), (3,4)~~>(1,2)
├── scan uv
│ ├── columns: u:5(int)
│ └── stats: [rows=10000]
│ └── stats: [rows=10000, distinct(5)=500, null(5)=0]
└── filters
└── x = u [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: (/NULL - ]), fd=(1)==(5), (5)==(1)]

Expand Down Expand Up @@ -405,21 +405,21 @@ GROUP BY y
----
project
├── columns: count:8(int)
├── stats: [rows=400]
├── stats: [rows=397.482791]
└── group-by
├── columns: y:2(int) count_rows:8(int)
├── grouping columns: y:2(int)
├── stats: [rows=400, distinct(2)=400, null(2)=0]
├── stats: [rows=397.482791, distinct(2)=397.482791, null(2)=0]
├── key: (2)
├── fd: (2)-->(8)
├── semi-join (hash)
│ ├── columns: x:1(int!null) y:2(int)
│ ├── stats: [rows=5000, distinct(2)=400, null(2)=0]
│ ├── stats: [rows=1666.66667, distinct(1)=500, null(1)=0, distinct(2)=397.482791, null(2)=0]
│ ├── key: (1)
│ ├── fd: (1)-->(2)
│ ├── scan xysd
│ │ ├── columns: x:1(int!null) y:2(int)
│ │ ├── stats: [rows=5000, distinct(2)=400, null(2)=0]
│ │ ├── stats: [rows=5000, distinct(1)=5000, null(1)=0, distinct(2)=400, null(2)=0]
│ │ ├── key: (1)
│ │ └── fd: (1)-->(2)
│ ├── scan uv
Expand All @@ -439,21 +439,21 @@ GROUP BY y
----
project
├── columns: count:8(int)
├── stats: [rows=400]
├── stats: [rows=399.999565]
└── group-by
├── columns: y:2(int) count_rows:8(int)
├── grouping columns: y:2(int)
├── stats: [rows=400, distinct(2)=400, null(2)=0]
├── stats: [rows=399.999565, distinct(2)=399.999565, null(2)=0]
├── key: (2)
├── fd: (2)-->(8)
├── anti-join (hash)
│ ├── columns: x:1(int!null) y:2(int)
│ ├── stats: [rows=5000, distinct(2)=400, null(2)=0]
│ ├── stats: [rows=3333.33333, distinct(1)=3333.33333, null(1)=0, distinct(2)=399.999565, null(2)=0]
│ ├── key: (1)
│ ├── fd: (1)-->(2)
│ ├── scan xysd
│ │ ├── columns: x:1(int!null) y:2(int)
│ │ ├── stats: [rows=5000, distinct(2)=400, null(2)=0]
│ │ ├── stats: [rows=5000, distinct(1)=5000, null(1)=0, distinct(2)=400, null(2)=0]
│ │ ├── key: (1)
│ │ └── fd: (1)-->(2)
│ ├── scan uv
Expand Down Expand Up @@ -1257,3 +1257,80 @@ semi-join (lookup def)
│ ├── prune: (1-3)
│ └── interesting orderings: (+1,+2)
└── filters (true)

expr format=show-all colstat=5 colstat=6 colstat=(5, 6) colstat=1 colstat=2 colstat=3 colstat=(1, 2, 3)
(MakeLookupJoin
(Scan [ (Table "abc") (Cols "a,b,c") ])
[ (JoinType "anti-join") (Table "def") (Index "def@primary") (KeyCols "a,b") (Cols "a,b,c,d,e,f") ]
[ ]
)
----
anti-join (lookup def)
├── columns: t.public.abc.a:1(int!null) t.public.abc.b:2(int!null) t.public.abc.c:3(int)
├── key columns: [1 2] = [4 5]
├── stats: [rows=1e-10, distinct(1)=1e-10, null(1)=0, distinct(2)=1e-10, null(2)=0, distinct(3)=1e-10, null(3)=1e-10, distinct(5)=1e-10, null(5)=0, distinct(6)=1e-10, null(6)=0, distinct(5,6)=1e-10, null(5,6)=0, distinct(1-3)=1e-10, null(1-3)=1e-10]
├── cost: 506.03
├── key: (1,2)
├── fd: (1,2)-->(3)
├── interesting orderings: (+1,+2)
├── scan t.public.abc
│ ├── columns: t.public.abc.a:1(int!null) t.public.abc.b:2(int!null) t.public.abc.c:3(int)
│ ├── stats: [rows=100, distinct(1)=100, null(1)=0, distinct(2)=10, null(2)=0, distinct(3)=10, null(3)=1, distinct(1-3)=100, null(1-3)=1]
│ ├── cost: 106.02
│ ├── key: (1,2)
│ ├── fd: (1,2)-->(3)
│ ├── prune: (1-3)
│ └── interesting orderings: (+1,+2)
└── filters (true)

expr format=show-all colstat=5 colstat=6 colstat=(5, 6) colstat=1 colstat=2 colstat=3 colstat=(1, 2, 3)
(MakeLookupJoin
(Scan [ (Table "abc") (Cols "a,b,c") ])
[ (JoinType "semi-join") (Table "def") (Index "def@primary") (KeyCols "a,b") (Cols "a,b,c,d,e,f") ]
[ (False) ]
)
----
semi-join (lookup def)
├── columns: t.public.abc.a:1(int!null) t.public.abc.b:2(int!null) t.public.abc.c:3(int)
├── key columns: [1 2] = [4 5]
├── stats: [rows=0, distinct(1)=0, null(1)=0, distinct(2)=0, null(2)=0, distinct(3)=0, null(3)=0, distinct(5)=0, null(5)=0, distinct(6)=0, null(6)=0, distinct(5,6)=0, null(5,6)=0, distinct(1-3)=0, null(1-3)=0]
├── cost: 713.04
├── key: (1,2)
├── fd: (1,2)-->(3)
├── interesting orderings: (+1,+2)
├── scan t.public.abc
│ ├── columns: t.public.abc.a:1(int!null) t.public.abc.b:2(int!null) t.public.abc.c:3(int)
│ ├── stats: [rows=100, distinct(1)=100, null(1)=0, distinct(2)=10, null(2)=0, distinct(3)=10, null(3)=1, distinct(1-3)=100, null(1-3)=1]
│ ├── cost: 106.02
│ ├── key: (1,2)
│ ├── fd: (1,2)-->(3)
│ ├── prune: (1-3)
│ └── interesting orderings: (+1,+2)
└── filters
└── false [type=bool]

expr format=show-all colstat=5 colstat=6 colstat=(5, 6) colstat=1 colstat=2 colstat=3 colstat=(1, 2, 3)
(MakeLookupJoin
(Scan [ (Table "abc") (Cols "a,b,c") ])
[ (JoinType "anti-join") (Table "def") (Index "def@primary") (KeyCols "a,b") (Cols "a,b,c,d,e,f") ]
[ (False) ]
)
----
anti-join (lookup def)
├── columns: t.public.abc.a:1(int!null) t.public.abc.b:2(int!null) t.public.abc.c:3(int)
├── key columns: [1 2] = [4 5]
├── stats: [rows=100, distinct(1)=100, null(1)=0, distinct(2)=10, null(2)=0, distinct(3)=10, null(3)=1, distinct(5)=1, null(5)=0, distinct(6)=1, null(6)=0, distinct(5,6)=1, null(5,6)=0, distinct(1-3)=100, null(1-3)=1]
├── cost: 506.04
├── key: (1,2)
├── fd: (1,2)-->(3)
├── interesting orderings: (+1,+2)
├── scan t.public.abc
│ ├── columns: t.public.abc.a:1(int!null) t.public.abc.b:2(int!null) t.public.abc.c:3(int)
│ ├── stats: [rows=100, distinct(1)=100, null(1)=0, distinct(2)=10, null(2)=0, distinct(3)=10, null(3)=1, distinct(1-3)=100, null(1-3)=1]
│ ├── cost: 106.02
│ ├── key: (1,2)
│ ├── fd: (1,2)-->(3)
│ ├── prune: (1-3)
│ └── interesting orderings: (+1,+2)
└── filters
└── false [type=bool]
Loading

0 comments on commit 28d0a38

Please sign in to comment.