From 17d77b8d7300c9fe7cd6df274a663da20fbad184 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Wed, 21 Jun 2023 13:24:46 +0800 Subject: [PATCH 1/4] add --- planner/core/exhaust_physical_plans.go | 79 +++++++++++++++++++++++--- sessionctx/variable/session.go | 3 + statistics/integration_test.go | 15 +++++ 3 files changed, 90 insertions(+), 7 deletions(-) diff --git a/planner/core/exhaust_physical_plans.go b/planner/core/exhaust_physical_plans.go index 415e7beab11d0..cb0b23ca7b1b1 100644 --- a/planner/core/exhaust_physical_plans.go +++ b/planner/core/exhaust_physical_plans.go @@ -32,6 +32,7 @@ import ( "github.com/pingcap/tidb/planner/property" "github.com/pingcap/tidb/planner/util" "github.com/pingcap/tidb/sessionctx" + "github.com/pingcap/tidb/sessionctx/variable" "github.com/pingcap/tidb/statistics" "github.com/pingcap/tidb/types" "github.com/pingcap/tidb/util/chunk" @@ -1152,13 +1153,76 @@ func (*LogicalJoin) constructInnerUnionScan(us *LogicalUnionScan, reader Physica return physicalUnionScan } +func getColsNDVLowerBoundFromHistColl(cols []*expression.Column, histColl *statistics.HistColl) int64 { + if len(cols) == 0 || histColl == nil { + return -1 + } + colUIDs := make([]int64, len(cols)) + for i, col := range cols { + colUIDs[i] = col.UniqueID + } + + // Note that we don't need to specially handle prefix index in this function, because the NDV of a prefix index is + // equal or less than the corresponding normal index, and that's safe here since we want a lower bound. + + // 1. Try to get NDV from column stats if it's a single column. + if len(colUIDs) == 1 && histColl.Columns != nil { + uid := colUIDs[0] + if colStats, ok := histColl.Columns[uid]; ok && colStats != nil { + return colStats.NDV + } + } + + slices.Sort(colUIDs) + if histColl.Indices == nil || histColl.Idx2ColumnIDs == nil { + return -1 + } + + // 2. Try to get NDV from index stats. 
+ for idxID, idxCols := range histColl.Idx2ColumnIDs { + if len(idxCols) != len(colUIDs) { + continue + } + orderedIdxCols := make([]int64, len(idxCols)) + copy(orderedIdxCols, idxCols) + slices.Sort(orderedIdxCols) + if !slices.Equal(orderedIdxCols, colUIDs) { + continue + } + if idxStats, ok := histColl.Indices[idxID]; ok && idxStats != nil { + return idxStats.NDV + } + } + + // TODO: if there's an index that contains the expected columns, we can also make use of its NDV. + // For example, NDV(a,b,c) / NDV(c) is a safe lower bound of NDV(a,b). + + // 3. If we still haven't got an NDV, we use the minimal NDV in the column stats as a lower bound. + // This would happen when len(cols) > 1 and no proper index stats are available. + minNDV := int64(-1) + for _, colStats := range histColl.Columns { + if colStats == nil || colStats.Info == nil { + continue + } + col := colStats.Info + if col.IsGenerated() && !col.GeneratedStored { + continue + } + if (colStats.NDV > 0 && minNDV <= 0) || + colStats.NDV < minNDV { + minNDV = colStats.NDV + } + } + return minNDV +} + // constructInnerIndexScanTask is specially used to construct the inner plan for PhysicalIndexJoin. func (p *LogicalJoin) constructInnerIndexScanTask( wrapper *indexJoinInnerChildWrapper, path *util.AccessPath, ranges ranger.Ranges, filterConds []expression.Expression, - _ []*expression.Column, + innerJoinKeys []*expression.Column, rangeInfo string, keepOrder bool, desc bool, @@ -1256,12 +1320,13 @@ func (p *LogicalJoin) constructInnerIndexScanTask( // the estimated row count of the IndexScan should be no larger than (total row count / NDV of join key columns). // We use it as an upper bound here. 
rowCountUpperBound := -1.0 - //if ds.tableStats != nil { - // joinKeyNDV := getColsNDVLowerBoundFromHistColl(innerJoinKeys, ds.tableStats.HistColl) - // if joinKeyNDV > 0 { - // rowCountUpperBound = ds.tableStats.RowCount / float64(joinKeyNDV) - // } - //} + fixValue, ok := ds.ctx.GetSessionVars().GetOptimizerFixControlValue(variable.TiDBOptFixControl44855) + if ok && variable.TiDBOptOn(fixValue) && ds.tableStats != nil { + joinKeyNDV := getColsNDVLowerBoundFromHistColl(innerJoinKeys, ds.tableStats.HistColl) + if joinKeyNDV > 0 { + rowCountUpperBound = ds.tableStats.RowCount / float64(joinKeyNDV) + } + } if rowCountUpperBound > 0 { rowCount = math.Min(rowCount, rowCountUpperBound) diff --git a/sessionctx/variable/session.go b/sessionctx/variable/session.go index 1918e1e63a580..bcc606ed4c8e8 100644 --- a/sessionctx/variable/session.go +++ b/sessionctx/variable/session.go @@ -1518,6 +1518,9 @@ var ( TiDBOptFixControl44262 uint64 = 44262 // TiDBOptFixControl44389 controls whether to consider non-point ranges of some CNF item when building ranges. TiDBOptFixControl44389 uint64 = 44389 + // TiDBOptFixControl44855 controls whether to use a more accurate upper bound when estimating row count of index + // range scan under inner side of index join. + TiDBOptFixControl44855 uint64 = 44855 ) // GetOptimizerFixControlValue returns the specified value of the optimizer fix control. 
diff --git a/statistics/integration_test.go b/statistics/integration_test.go index 210e14e23d5ec..bc84d82f91fcc 100644 --- a/statistics/integration_test.go +++ b/statistics/integration_test.go @@ -790,6 +790,21 @@ func TestIndexJoinInnerRowCountUpperBound(t *testing.T) { " └─Selection(Probe) 1000000.00 cop[tikv] eq(test.t.a, 0)", " └─TableRowIDScan 500000000.00 cop[tikv] table:t2 keep order:false, stats:pseudo", )) + + testKit.MustExec("set @@tidb_opt_fix_control = '44855:ON'") + testKit.MustQuery("explain format = 'brief' " + + "select /*+ inl_join(t2) */ * from (select * from t where t.a < 1) as t1 join t t2 where t2.a = 0 and t1.a = t2.b"). + Check(testkit.Rows( + "IndexJoin 1000000.00 root inner join, inner:IndexLookUp, outer key:test.t.a, inner key:test.t.b, equal cond:eq(test.t.a, test.t.b)", + "├─TableReader(Build) 1000.00 root data:Selection", + "│ └─Selection 1000.00 cop[tikv] lt(test.t.a, 1), not(isnull(test.t.a))", + "│ └─TableFullScan 500000.00 cop[tikv] table:t keep order:false, stats:pseudo", + "└─IndexLookUp(Probe) 1000000.00 root ", + " ├─Selection(Build) 1000000.00 cop[tikv] not(isnull(test.t.b))", + " │ └─IndexRangeScan 1000000.00 cop[tikv] table:t2, index:idx(b) range: decided by [eq(test.t.b, test.t.a)], keep order:false, stats:pseudo", + " └─Selection(Probe) 1000000.00 cop[tikv] eq(test.t.a, 0)", + " └─TableRowIDScan 1000000.00 cop[tikv] table:t2 keep order:false, stats:pseudo", + )) } func TestOrderingIdxSelectivityThreshold(t *testing.T) { From 3d10e83e4f8690f99e613a4928dcfadb57c6eef0 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Wed, 21 Jun 2023 14:09:22 +0800 Subject: [PATCH 2/4] add comments and simplify --- planner/core/exhaust_physical_plans.go | 2 +- statistics/integration_test.go | 51 +++++++++++++------------- 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/planner/core/exhaust_physical_plans.go b/planner/core/exhaust_physical_plans.go index 
cb0b23ca7b1b1..be13dc0bdeb46 100644 --- a/planner/core/exhaust_physical_plans.go +++ b/planner/core/exhaust_physical_plans.go @@ -1314,7 +1314,7 @@ func (p *LogicalJoin) constructInnerIndexScanTask( is.initSchema(append(path.FullIdxCols, ds.commonHandleCols...), cop.tablePlan != nil) indexConds, tblConds := ds.splitIndexFilterConditions(filterConds, path.FullIdxCols, path.FullIdxColLens) - // Note: due to a regression in JOB workload, we need to revert the logic below for now. + // Note: due to a regression in JOB workload, we use the optimizer fix control to enable this for now. // // Because we are estimating an average row count of the inner side corresponding to each row from the outer side, // the estimated row count of the IndexScan should be no larger than (total row count / NDV of join key columns). diff --git a/statistics/integration_test.go b/statistics/integration_test.go index bc84d82f91fcc..06698c7db8c62 100644 --- a/statistics/integration_test.go +++ b/statistics/integration_test.go @@ -777,34 +777,33 @@ func TestIndexJoinInnerRowCountUpperBound(t *testing.T) { stat := h.GetTableStats(tblInfo) stat.HistColl = mockStatsTbl.HistColl - testKit.MustQuery("explain format = 'brief' " + - "select /*+ inl_join(t2) */ * from (select * from t where t.a < 1) as t1 join t t2 where t2.a = 0 and t1.a = t2.b"). 
- Check(testkit.Rows( - "IndexJoin 1000000.00 root inner join, inner:IndexLookUp, outer key:test.t.a, inner key:test.t.b, equal cond:eq(test.t.a, test.t.b)", - "├─TableReader(Build) 1000.00 root data:Selection", - "│ └─Selection 1000.00 cop[tikv] lt(test.t.a, 1), not(isnull(test.t.a))", - "│ └─TableFullScan 500000.00 cop[tikv] table:t keep order:false, stats:pseudo", - "└─IndexLookUp(Probe) 1000000.00 root ", - " ├─Selection(Build) 500000000.00 cop[tikv] not(isnull(test.t.b))", - " │ └─IndexRangeScan 500000000.00 cop[tikv] table:t2, index:idx(b) range: decided by [eq(test.t.b, test.t.a)], keep order:false, stats:pseudo", - " └─Selection(Probe) 1000000.00 cop[tikv] eq(test.t.a, 0)", - " └─TableRowIDScan 500000000.00 cop[tikv] table:t2 keep order:false, stats:pseudo", - )) + query := "explain format = 'brief' " + + "select /*+ inl_join(t2) */ * from (select * from t where t.a < 1) as t1 join t t2 where t2.a = 0 and t1.a = t2.b" + + testKit.MustQuery(query).Check(testkit.Rows( + "IndexJoin 1000000.00 root inner join, inner:IndexLookUp, outer key:test.t.a, inner key:test.t.b, equal cond:eq(test.t.a, test.t.b)", + "├─TableReader(Build) 1000.00 root data:Selection", + "│ └─Selection 1000.00 cop[tikv] lt(test.t.a, 1), not(isnull(test.t.a))", + "│ └─TableFullScan 500000.00 cop[tikv] table:t keep order:false, stats:pseudo", + "└─IndexLookUp(Probe) 1000000.00 root ", + " ├─Selection(Build) 500000000.00 cop[tikv] not(isnull(test.t.b))", + " │ └─IndexRangeScan 500000000.00 cop[tikv] table:t2, index:idx(b) range: decided by [eq(test.t.b, test.t.a)], keep order:false, stats:pseudo", + " └─Selection(Probe) 1000000.00 cop[tikv] eq(test.t.a, 0)", + " └─TableRowIDScan 500000000.00 cop[tikv] table:t2 keep order:false, stats:pseudo", + )) testKit.MustExec("set @@tidb_opt_fix_control = '44855:ON'") - testKit.MustQuery("explain format = 'brief' " + - "select /*+ inl_join(t2) */ * from (select * from t where t.a < 1) as t1 join t t2 where t2.a = 0 and t1.a = t2.b"). 
- Check(testkit.Rows( - "IndexJoin 1000000.00 root inner join, inner:IndexLookUp, outer key:test.t.a, inner key:test.t.b, equal cond:eq(test.t.a, test.t.b)", - "├─TableReader(Build) 1000.00 root data:Selection", - "│ └─Selection 1000.00 cop[tikv] lt(test.t.a, 1), not(isnull(test.t.a))", - "│ └─TableFullScan 500000.00 cop[tikv] table:t keep order:false, stats:pseudo", - "└─IndexLookUp(Probe) 1000000.00 root ", - " ├─Selection(Build) 1000000.00 cop[tikv] not(isnull(test.t.b))", - " │ └─IndexRangeScan 1000000.00 cop[tikv] table:t2, index:idx(b) range: decided by [eq(test.t.b, test.t.a)], keep order:false, stats:pseudo", - " └─Selection(Probe) 1000000.00 cop[tikv] eq(test.t.a, 0)", - " └─TableRowIDScan 1000000.00 cop[tikv] table:t2 keep order:false, stats:pseudo", - )) + testKit.MustQuery(query).Check(testkit.Rows( + "IndexJoin 1000000.00 root inner join, inner:IndexLookUp, outer key:test.t.a, inner key:test.t.b, equal cond:eq(test.t.a, test.t.b)", + "├─TableReader(Build) 1000.00 root data:Selection", + "│ └─Selection 1000.00 cop[tikv] lt(test.t.a, 1), not(isnull(test.t.a))", + "│ └─TableFullScan 500000.00 cop[tikv] table:t keep order:false, stats:pseudo", + "└─IndexLookUp(Probe) 1000000.00 root ", + " ├─Selection(Build) 1000000.00 cop[tikv] not(isnull(test.t.b))", + " │ └─IndexRangeScan 1000000.00 cop[tikv] table:t2, index:idx(b) range: decided by [eq(test.t.b, test.t.a)], keep order:false, stats:pseudo", + " └─Selection(Probe) 1000000.00 cop[tikv] eq(test.t.a, 0)", + " └─TableRowIDScan 1000000.00 cop[tikv] table:t2 keep order:false, stats:pseudo", + )) } func TestOrderingIdxSelectivityThreshold(t *testing.T) { From b6a2bb162af5392531695a8fddd9456ebdbb7b73 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Wed, 21 Jun 2023 18:12:10 +0800 Subject: [PATCH 3/4] update --- planner/core/exhaust_physical_plans.go | 63 +++++++++++++------------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git 
a/planner/core/exhaust_physical_plans.go b/planner/core/exhaust_physical_plans.go index be13dc0bdeb46..5094123972334 100644 --- a/planner/core/exhaust_physical_plans.go +++ b/planner/core/exhaust_physical_plans.go @@ -38,6 +38,7 @@ import ( "github.com/pingcap/tidb/util/chunk" "github.com/pingcap/tidb/util/collate" "github.com/pingcap/tidb/util/logutil" + "github.com/pingcap/tidb/util/mathutil" "github.com/pingcap/tidb/util/plancodec" "github.com/pingcap/tidb/util/ranger" "github.com/pingcap/tidb/util/set" @@ -950,7 +951,7 @@ func (p *LogicalJoin) buildIndexJoinInner2IndexScan( maxOneRow = ok && (sf.FuncName.L == ast.EQ) } } - innerTask := p.constructInnerIndexScanTask(wrapper, helper.chosenPath, helper.chosenRanges.Range(), helper.chosenRemained, innerJoinKeys, rangeInfo, false, false, avgInnerRowCnt, maxOneRow) + innerTask := p.constructInnerIndexScanTask(wrapper, helper.chosenPath, helper.chosenRanges.Range(), helper.chosenRemained, innerJoinKeys, helper.idxOff2KeyOff, rangeInfo, false, false, avgInnerRowCnt, maxOneRow) failpoint.Inject("MockOnlyEnableIndexHashJoin", func(val failpoint.Value) { if val.(bool) && !p.ctx.GetSessionVars().InRestrictedSQL { failpoint.Return(p.constructIndexHashJoin(prop, outerIdx, innerTask, helper.chosenRanges, keyOff2IdxOff, helper.chosenPath, helper.lastColManager)) @@ -965,7 +966,7 @@ func (p *LogicalJoin) buildIndexJoinInner2IndexScan( // Because we can't keep order for union scan, if there is a union scan in inner task, // we can't construct index merge join. 
if us == nil { - innerTask2 := p.constructInnerIndexScanTask(wrapper, helper.chosenPath, helper.chosenRanges.Range(), helper.chosenRemained, innerJoinKeys, rangeInfo, true, !prop.IsSortItemEmpty() && prop.SortItems[0].Desc, avgInnerRowCnt, maxOneRow) + innerTask2 := p.constructInnerIndexScanTask(wrapper, helper.chosenPath, helper.chosenRanges.Range(), helper.chosenRemained, innerJoinKeys, helper.idxOff2KeyOff, rangeInfo, true, !prop.IsSortItemEmpty() && prop.SortItems[0].Desc, avgInnerRowCnt, maxOneRow) if innerTask2 != nil { joins = append(joins, p.constructIndexMergeJoin(prop, outerIdx, innerTask2, helper.chosenRanges, keyOff2IdxOff, helper.chosenPath, helper.lastColManager)...) } @@ -1153,32 +1154,24 @@ func (*LogicalJoin) constructInnerUnionScan(us *LogicalUnionScan, reader Physica return physicalUnionScan } -func getColsNDVLowerBoundFromHistColl(cols []*expression.Column, histColl *statistics.HistColl) int64 { - if len(cols) == 0 || histColl == nil { +func getColsNDVLowerBoundFromHistColl(colUIDs []int64, histColl *statistics.HistColl) int64 { + if len(colUIDs) == 0 || histColl == nil { return -1 } - colUIDs := make([]int64, len(cols)) - for i, col := range cols { - colUIDs[i] = col.UniqueID - } - - // Note that we don't need to specially handle prefix index in this function, because the NDV of a prefix index is - // equal or less than the corresponding normal index, and that's safe here since we want a lower bound. // 1. Try to get NDV from column stats if it's a single column. if len(colUIDs) == 1 && histColl.Columns != nil { uid := colUIDs[0] - if colStats, ok := histColl.Columns[uid]; ok && colStats != nil { + if colStats, ok := histColl.Columns[uid]; ok && colStats != nil && colStats.IsStatsInitialized() { return colStats.NDV } } slices.Sort(colUIDs) - if histColl.Indices == nil || histColl.Idx2ColumnIDs == nil { - return -1 - } // 2. Try to get NDV from index stats. 
+ // Note that we don't need to handle prefix indexes specially here, because the NDV of a prefix index is + // less than or equal to that of the corresponding normal index, which is safe here since we want a lower bound. for idxID, idxCols := range histColl.Idx2ColumnIDs { if len(idxCols) != len(colUIDs) { continue @@ -1189,7 +1182,7 @@ func getColsNDVLowerBoundFromHistColl(cols []*expression.Column, histColl *stati if !slices.Equal(orderedIdxCols, colUIDs) { continue } - if idxStats, ok := histColl.Indices[idxID]; ok && idxStats != nil { + if idxStats, ok := histColl.Indices[idxID]; ok && idxStats != nil && idxStats.IsStatsInitialized() { return idxStats.NDV } } @@ -1197,23 +1190,17 @@ func getColsNDVLowerBoundFromHistColl(cols []*expression.Column, histColl *stati // TODO: if there's an index that contains the expected columns, we can also make use of its NDV. // For example, NDV(a,b,c) / NDV(c) is a safe lower bound of NDV(a,b). - // 3. If we still haven't got an NDV, we use the minimal NDV in the column stats as a lower bound. + // 3. If we still haven't got an NDV, we use the maximum NDV in the column stats as a lower bound. // This would happen when len(cols) > 1 and no proper index stats are available. - minNDV := int64(-1) - for _, colStats := range histColl.Columns { - if colStats == nil || colStats.Info == nil { + maxNDV := int64(-1) + for _, uid := range colUIDs { + colStats := histColl.Columns[uid] + if colStats == nil || !colStats.IsStatsInitialized() { continue } - col := colStats.Info - if col.IsGenerated() && !col.GeneratedStored { - continue - } - if (colStats.NDV > 0 && minNDV <= 0) || - colStats.NDV < minNDV { - minNDV = colStats.NDV - } + maxNDV = mathutil.Max(maxNDV, colStats.NDV) } - return minNDV + return maxNDV } // constructInnerIndexScanTask is specially used to construct the inner plan for PhysicalIndexJoin. 
@@ -1222,7 +1209,8 @@ func (p *LogicalJoin) constructInnerIndexScanTask( path *util.AccessPath, ranges ranger.Ranges, filterConds []expression.Expression, - innerJoinKeys []*expression.Column, + _ []*expression.Column, + idxOffset2joinKeyOffset []int, rangeInfo string, keepOrder bool, desc bool, @@ -1318,11 +1306,22 @@ func (p *LogicalJoin) constructInnerIndexScanTask( // // Because we are estimating an average row count of the inner side corresponding to each row from the outer side, // the estimated row count of the IndexScan should be no larger than (total row count / NDV of join key columns). - // We use it as an upper bound here. + // We can calculate a lower bound of the NDV, and therefore we can get an upper bound of the row count here. rowCountUpperBound := -1.0 fixValue, ok := ds.ctx.GetSessionVars().GetOptimizerFixControlValue(variable.TiDBOptFixControl44855) if ok && variable.TiDBOptOn(fixValue) && ds.tableStats != nil { - joinKeyNDV := getColsNDVLowerBoundFromHistColl(innerJoinKeys, ds.tableStats.HistColl) + usedColIDs := make([]int64, 0) + // We only consider columns in this index that (1) are used to probe as join keys, + // and (2) are not prefix columns in the index (for which we can't easily get a lower bound). + for idxOffset, joinKeyOffset := range idxOffset2joinKeyOffset { + if joinKeyOffset < 0 || + path.FullIdxColLens[idxOffset] != types.UnspecifiedLength || + path.FullIdxCols[idxOffset] == nil { + continue + } + usedColIDs = append(usedColIDs, path.FullIdxCols[idxOffset].UniqueID) + } + joinKeyNDV := getColsNDVLowerBoundFromHistColl(usedColIDs, ds.tableStats.HistColl) if joinKeyNDV > 0 { rowCountUpperBound = ds.tableStats.RowCount / float64(joinKeyNDV) } From 482c43644b1fa5dd0d02d7a256a92570b8c3312c Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Sun, 25 Jun 2023 13:45:01 +0800 Subject: [PATCH 4/4] update comments --- planner/core/exhaust_physical_plans.go | 2 +- 1 file changed, 1 insertion(+), 1
deletion(-) diff --git a/planner/core/exhaust_physical_plans.go b/planner/core/exhaust_physical_plans.go index 5094123972334..0bee03267e9d0 100644 --- a/planner/core/exhaust_physical_plans.go +++ b/planner/core/exhaust_physical_plans.go @@ -1154,6 +1154,7 @@ func (*LogicalJoin) constructInnerUnionScan(us *LogicalUnionScan, reader Physica return physicalUnionScan } +// getColsNDVLowerBoundFromHistColl tries to get a lower bound of the NDV of columns (whose uniqueIDs are colUIDs). func getColsNDVLowerBoundFromHistColl(colUIDs []int64, histColl *statistics.HistColl) int64 { if len(colUIDs) == 0 || histColl == nil { return -1 @@ -1191,7 +1192,6 @@ func getColsNDVLowerBoundFromHistColl(colUIDs []int64, histColl *statistics.Hist // For example, NDV(a,b,c) / NDV(c) is a safe lower bound of NDV(a,b). // 3. If we still haven't got an NDV, we use the maximum NDV in the column stats as a lower bound. - // This would happen when len(cols) > 1 and no proper index stats are available. maxNDV := int64(-1) for _, uid := range colUIDs { colStats := histColl.Columns[uid]