Skip to content

Commit

Permalink
planner: Move the Selectivity function from the stats package into ca…
Browse files Browse the repository at this point in the history
…rdinality package (#46410)

ref #46358
  • Loading branch information
qw4990 authored Aug 25, 2023
1 parent 2adb1dc commit 66ddb7b
Show file tree
Hide file tree
Showing 14 changed files with 436 additions and 419 deletions.
4 changes: 3 additions & 1 deletion planner/cardinality/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ go_library(
"//types",
"//types/parser_driver",
"//util/chunk",
"//util/codec",
"//util/collate",
"//util/logutil",
"//util/ranger",
"//util/tracing",
Expand All @@ -47,7 +49,7 @@ go_test(
data = glob(["testdata/**"]),
embed = [":cardinality"],
flaky = True,
shard_count = 29,
shard_count = 30,
deps = [
"//config",
"//domain",
Expand Down
2 changes: 2 additions & 0 deletions planner/cardinality/row_count_column.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ func init() {
statistics.GetRowCountByColumnRanges = GetRowCountByColumnRanges
statistics.GetRowCountByIntColumnRanges = GetRowCountByIntColumnRanges
statistics.GetRowCountByIndexRanges = GetRowCountByIndexRanges
statistics.OutOfRangeEQSelectivity = outOfRangeEQSelectivity
statistics.GetEqualCondSelectivity = getEqualCondSelectivity
}

// GetRowCountByColumnRanges estimates the row count by a slice of Range.
Expand Down
333 changes: 331 additions & 2 deletions planner/cardinality/selectivity.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,19 @@ import (
"github.com/pingcap/tidb/sessionctx"
"github.com/pingcap/tidb/statistics"
"github.com/pingcap/tidb/types"
"github.com/pingcap/tidb/util/chunk"
"github.com/pingcap/tidb/util/codec"
"github.com/pingcap/tidb/util/collate"
"github.com/pingcap/tidb/util/logutil"
"github.com/pingcap/tidb/util/ranger"
"go.uber.org/zap"
"golang.org/x/exp/maps"
)

var (
// outOfRangeBetweenRate is the lower bound that outOfRangeEQSelectivity applies to NDV,
// so that a small NDV does not produce an inaccurately large selectivity estimate.
outOfRangeBetweenRate int64 = 100
)

// Selectivity calculates the selectivity of the expressions on the specified HistColl.
// The definition of selectivity is (row count after filter / row count before filter).
// And exprs must be CNF now, in other words, `exprs[0] and exprs[1] and ... and exprs[len - 1]`
Expand Down Expand Up @@ -350,7 +357,7 @@ OUTER:
// Try to cover remaining string matching functions by evaluating the expressions with TopN to estimate.
if ctx.GetSessionVars().EnableEvalTopNEstimationForStrMatch() {
for i, scalarCond := range notCoveredStrMatch {
ok, sel, err := coll.GetSelectivityByFilter(ctx, []expression.Expression{scalarCond})
ok, sel, err := GetSelectivityByFilter(ctx, coll, []expression.Expression{scalarCond})
if err != nil {
sc.AppendWarning(errors.New("Error when using TopN-assisted estimation: " + err.Error()))
}
Expand All @@ -365,7 +372,7 @@ OUTER:
}
}
for i, scalarCond := range notCoveredNegateStrMatch {
ok, sel, err := coll.GetSelectivityByFilter(ctx, []expression.Expression{scalarCond})
ok, sel, err := GetSelectivityByFilter(ctx, coll, []expression.Expression{scalarCond})
if err != nil {
sc.AppendWarning(errors.New("Error when using TopN-assisted estimation: " + err.Error()))
}
Expand Down Expand Up @@ -613,3 +620,325 @@ func getMaskAndRanges(ctx sessionctx.Context, exprs []expression.Expression, ran
}
return mask, ranges, false, nil
}

// GetSelectivityByFilter tries to estimate the selectivity of expressions by evaluating them against
// the TopN values, the histogram bucket boundaries and NULL.
// Currently, this method can only handle expressions involving a single column.
//
// ok is false (with a zero selectivity and nil error) when the estimation is not applicable: the filters
// are not safe to evaluate, they do not reference exactly one column, the column is a non-binary
// "new collation" string column, no fully loaded version 2 stats are available, or the stats contain no rows.
// Errors from decoding/evaluating the TopN or histogram parts are returned with ok == false.
// An error from evaluating the NULL part is returned together with ok == true and the selectivity
// computed from the other parts.
func GetSelectivityByFilter(sctx sessionctx.Context, coll *statistics.HistColl, filters []expression.Expression) (ok bool, selectivity float64, err error) {
	// 1. Make sure the expressions
	//   (1) are safe to be evaluated here,
	//   (2) involve only one column,
	//   (3) and this column is not a "new collation" string column so that we're able to restore values from the stats.
	for _, filter := range filters {
		if expression.IsMutableEffectsExpr(filter) {
			return false, 0, nil
		}
	}
	if expression.ContainCorrelatedColumn(filters) {
		return false, 0, nil
	}
	cols := expression.ExtractColumnsFromExpressions(nil, filters, nil)
	if len(cols) != 1 {
		return false, 0, nil
	}
	col := cols[0]
	tp := col.RetType
	if types.IsString(tp.GetType()) && collate.NewCollationEnabled() && !collate.IsBinCollation(tp.GetCollate()) {
		return false, 0, nil
	}

	// 2. Get the available stats, make sure it's a ver2 stats and get the needed data structure from it.
	isIndex, i := findAvailableStatsForCol(sctx, coll, col.UniqueID)
	if i < 0 {
		return false, 0, nil
	}
	var statsVer, nullCnt int64
	var histTotalCnt, totalCnt float64
	var topnTotalCnt uint64
	var hist *statistics.Histogram
	var topn *statistics.TopN
	if isIndex {
		stats := coll.Indices[i]
		statsVer = stats.StatsVer
		hist = &stats.Histogram
		nullCnt = hist.NullCount
		topn = stats.TopN
	} else {
		stats := coll.Columns[i]
		statsVer = stats.StatsVer
		hist = &stats.Histogram
		nullCnt = hist.NullCount
		topn = stats.TopN
	}
	// Only in stats ver2, we can assume that: TopN + Histogram + NULL == All data
	if statsVer != statistics.Version2 {
		return false, 0, nil
	}
	topnTotalCnt = topn.TotalCount()
	histTotalCnt = hist.NotNullCount()
	totalCnt = float64(topnTotalCnt) + histTotalCnt + float64(nullCnt)
	// Every partial selectivity below is divided by totalCnt. With empty stats that would be 0/0 == NaN,
	// so report "not applicable" instead of returning a NaN selectivity.
	if totalCnt <= 0 {
		return false, 0, nil
	}

	var topNSel, histSel, nullSel float64

	// Prepare for evaluation.

	// For execution, we use Column.Index instead of Column.UniqueID to locate a column.
	// We have only one column here, so we set it to 0.
	originalIndex := col.Index
	col.Index = 0
	defer func() {
		// Restore the original Index to avoid unexpected situation.
		col.Index = originalIndex
	}()
	topNLen := 0
	histBucketsLen := hist.Len()
	if topn != nil {
		topNLen = len(topn.TopN)
	}
	c := chunk.NewChunkWithCapacity([]*types.FieldType{tp}, max(1, topNLen))
	selected := make([]bool, 0, max(histBucketsLen, topNLen))

	// 3. Calculate the TopN part selectivity.
	// This stage is considered as the core functionality of this method, errors in this stage would make this entire method fail.
	var topNSelectedCnt uint64
	if topn != nil {
		for _, item := range topn.TopN {
			_, val, err := codec.DecodeOne(item.Encoded)
			if err != nil {
				return false, 0, err
			}
			c.AppendDatum(0, &val)
		}
		selected, err = expression.VectorizedFilter(sctx, filters, chunk.NewIterator4Chunk(c), selected)
		if err != nil {
			return false, 0, err
		}
		for i, isTrue := range selected {
			if isTrue {
				topNSelectedCnt += topn.TopN[i].Count
			}
		}
	}
	topNSel = float64(topNSelectedCnt) / totalCnt

	// 4. Calculate the Histogram part selectivity.
	// The buckets upper bounds and the Bucket.Repeat are used like the TopN above.
	// The buckets lower bounds are used as random samples and are regarded equally.
	if hist != nil && histTotalCnt > 0 {
		selected = selected[:0]
		selected, err = expression.VectorizedFilter(sctx, filters, chunk.NewIterator4Chunk(hist.Bounds), selected)
		if err != nil {
			return false, 0, err
		}
		var bucketRepeatTotalCnt, bucketRepeatSelectedCnt, lowerBoundMatchCnt int64
		for i := range hist.Buckets {
			bucketRepeatTotalCnt += hist.Buckets[i].Repeat
			// Bucket i reads selected[2*i] (lower bound) and selected[2*i+1] (upper bound).
			// This should not happen, but we add this check for safety: stop before either index is out of range.
			if 2*i+1 >= len(selected) {
				break
			}
			if selected[2*i] {
				lowerBoundMatchCnt++
			}
			if selected[2*i+1] {
				bucketRepeatSelectedCnt += hist.Buckets[i].Repeat
			}
		}
		var lowerBoundsRatio, upperBoundsRatio, lowerBoundsSel, upperBoundsSel float64
		upperBoundsRatio = min(float64(bucketRepeatTotalCnt)/histTotalCnt, 1)
		lowerBoundsRatio = 1 - upperBoundsRatio
		if bucketRepeatTotalCnt > 0 {
			upperBoundsSel = float64(bucketRepeatSelectedCnt) / float64(bucketRepeatTotalCnt)
		}
		lowerBoundsSel = float64(lowerBoundMatchCnt) / float64(histBucketsLen)
		histSel = lowerBoundsSel*lowerBoundsRatio + upperBoundsSel*upperBoundsRatio
		histSel *= histTotalCnt / totalCnt
	}

	// 5. Calculate the NULL part selectivity.
	// Errors in this stage would be returned, but would not make this entire method fail.
	c.Reset()
	c.AppendNull(0)
	selected = selected[:0]
	selected, err = expression.VectorizedFilter(sctx, filters, chunk.NewIterator4Chunk(c), selected)
	if err != nil || len(selected) != 1 || !selected[0] {
		nullSel = 0
	} else {
		nullSel = float64(nullCnt) / totalCnt
	}

	// 6. Get the final result.
	res := topNSel + histSel + nullSel
	return true, res, err
}

// findAvailableStatsForCol looks for usable statistics for the column identified by uniqueID.
// Column stats are preferred; otherwise the stats of a single-column, non-prefix index on that
// column are used. In both cases the stats must be valid and fully loaded.
// It returns whether the result refers to an index, and the key into coll.Columns or coll.Indices.
// The returned idx is -1 when nothing usable is found.
func findAvailableStatsForCol(sctx sessionctx.Context, coll *statistics.HistColl, uniqueID int64) (isIndex bool, idx int64) {
	// First choice: the column's own stats.
	columnStats, found := coll.Columns[uniqueID]
	if found && columnStats != nil && !columnStats.IsInvalid(sctx, coll.Pseudo) && columnStats.IsFullLoad() {
		return false, uniqueID
	}

	// Second choice: a single-column index on this column.
	for indexID, indexColIDs := range coll.Idx2ColumnIDs {
		if len(indexColIDs) != 1 || indexColIDs[0] != uniqueID {
			continue
		}
		indexStats, found := coll.Indices[indexID]
		if !found {
			continue
		}
		// A prefix index has a specified length on its column, so skip it.
		if indexStats.Info.Columns[0].Length != types.UnspecifiedLength {
			continue
		}
		if indexStats.IsInvalid(sctx, coll.Pseudo) || !indexStats.IsFullLoad() {
			continue
		}
		return true, indexID
	}
	return false, -1
}

// getEqualCondSelectivity estimates the selectivity of an equal condition that uses the first
// usedColsLen columns of idx. bytes is the encoded value of the condition and idxPointRange is
// the corresponding point range.
//
// The estimate is chosen as follows:
//   - a unique index fully covered by the condition matches at most one row;
//   - a value outside the index's range is estimated heuristically via outOfRangeEQSelectivity;
//   - otherwise the index's own estimate for the encoded value is used, unless the per-column
//     cross validation finds a smaller row count, in which case its selectivity is returned.
func getEqualCondSelectivity(sctx sessionctx.Context, coll *statistics.HistColl, idx *statistics.Index, bytes []byte,
	usedColsLen int, idxPointRange *ranger.Range) (result float64, err error) {
	if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace {
		debugtrace.EnterContextCommon(sctx)
		defer func() {
			idxName := ""
			if idx != nil && idx.Info != nil {
				idxName = idx.Info.Name.O
			}
			debugtrace.RecordAnyValuesWithNames(
				sctx,
				"Index Name", idxName,
				"Encoded", bytes,
				"UsedColLen", usedColsLen,
				"Range", idxPointRange.String(),
				"Result", result,
				"error", err,
			)
			debugtrace.LeaveContextCommon(sctx)
		}()
	}

	allColsUsed := usedColsLen == len(idx.Info.Columns)
	if allColsUsed && idx.Info.Unique {
		// In this case, the row count is at most 1.
		return 1.0 / idx.TotalRowCount(), nil
	}

	if idx.OutOfRangeOnIndex(types.NewBytesDatum(bytes)) {
		// When the value is out of range, we could not find this value in the CM Sketch,
		// so we use heuristic methods to estimate the selectivity.
		estNDV := idx.NDV
		if estNDV <= 0 || !allColsUsed {
			// The index NDV is not applicable (the equal condition only uses prefix columns of the index,
			// or the NDV is not positive): fall back to the largest NDV among the used columns.
			estNDV = 0
			colIDs := coll.Idx2ColumnIDs[idx.ID]
			for pos := 0; pos < len(colIDs) && pos < usedColsLen; pos++ {
				if colStats, found := coll.Columns[colIDs[pos]]; found && colStats.Histogram.NDV > estNDV {
					estNDV = colStats.Histogram.NDV
				}
			}
		}
		return outOfRangeEQSelectivity(sctx, estNDV, coll.RealtimeCount, int64(idx.TotalRowCount())), nil
	}

	minRowCount, crossValidSel, err := crossValidationSelectivity(sctx, coll, idx, usedColsLen, idxPointRange)
	if err != nil {
		return 0, err
	}

	idxRowCount := float64(idx.QueryBytes(sctx, bytes))
	if minRowCount < idxRowCount {
		return crossValidSel, nil
	}
	return idxRowCount / idx.TotalRowCount(), nil
}

// outOfRangeEQSelectivity estimates the selectivity of an equal condition on an out-of-range value.
// It assumes that all modifications are insertions, and that the newly inserted rows are uniformly
// distributed with the same distribution as the analyzed rows, i.e. every distinct value owns the
// same number of rows (Tot/NDV).
// The input sctx is only used for debug trace; passing nil is safe when that is not needed.
func outOfRangeEQSelectivity(sctx sessionctx.Context, ndv, realtimeRowCount, columnRowCount int64) (result float64) {
	if sctx != nil && sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace {
		debugtrace.EnterContextCommon(sctx)
		defer func() {
			debugtrace.RecordAnyValuesWithNames(sctx, "Result", result)
			debugtrace.LeaveContextCommon(sctx)
		}()
	}

	newRows := realtimeRowCount - columnRowCount
	if newRows <= 0 {
		// No rows were added since the stats were collected, so the histogram contains
		// the whole data and an out-of-range value cannot match anything.
		return 0
	}

	// Clamp NDV from below to avoid inaccurate selectivity caused by small NDV.
	perValueSel := 1 / float64(max(ndv, outOfRangeBetweenRate))

	// The estimated matching rows must not exceed the number of newly added rows.
	if perValueSel*float64(columnRowCount) > float64(newRows) {
		return float64(newRows) / float64(columnRowCount)
	}
	return perValueSel
}

// crossValidationSelectivity cross-validates a multi-column equal condition against per-column stats.
// For each of the first usedColsLen columns of idx that has valid column stats in coll, it estimates
// the row count of that column's point range taken from idxPointRange.
//
// It returns:
//   - minRowCount: the smallest per-column row count (math.MaxFloat64 if no column was usable);
//   - crossValidationSelectivity: the product of the per-column selectivities, each computed as
//     row count / total row count of idx (1.0 if no column was usable);
//   - err: the first error reported by a column row count estimation.
func crossValidationSelectivity(
	sctx sessionctx.Context,
	coll *statistics.HistColl,
	idx *statistics.Index,
	usedColsLen int,
	idxPointRange *ranger.Range,
) (
	minRowCount float64,
	crossValidationSelectivity float64,
	err error,
) {
	if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace {
		debugtrace.EnterContextCommon(sctx)
		defer func() {
			idxName := ""
			if idx != nil && idx.Info != nil {
				idxName = idx.Info.Name.O
			}
			debugtrace.RecordAnyValuesWithNames(
				sctx,
				"Index Name", idxName,
				"minRowCount", minRowCount,
				"crossValidationSelectivity", crossValidationSelectivity,
				"error", err,
			)
			debugtrace.LeaveContextCommon(sctx)
		}()
	}

	minRowCount = math.MaxFloat64
	colIDs := coll.Idx2ColumnIDs[idx.ID]
	crossValidationSelectivity = 1.0
	idxRowCount := idx.TotalRowCount()

	for pos := 0; pos < len(colIDs) && pos < usedColsLen; pos++ {
		colStats, found := coll.Columns[colIDs[pos]]
		if !found || colStats.IsInvalid(sctx, coll.Pseudo) {
			continue
		}

		// Since the column range is point range(LowVal is equal to HighVal), both LowExclude and
		// HighExclude must be false. Otherwise we would get 0.0 estRow from GetColumnRowCount.
		pointRange := &ranger.Range{
			LowVal:      []types.Datum{idxPointRange.LowVal[pos]},
			LowExclude:  false,
			HighVal:     []types.Datum{idxPointRange.HighVal[pos]},
			HighExclude: false,
			Collators:   []collate.Collator{idxPointRange.Collators[pos]},
		}

		colRowCount, countErr := colStats.GetColumnRowCount(sctx, []*ranger.Range{pointRange}, coll.RealtimeCount, coll.ModifyCount, colStats.IsHandle)
		if countErr != nil {
			return 0, 0, countErr
		}

		crossValidationSelectivity *= colRowCount / idxRowCount
		if colRowCount < minRowCount {
			minRowCount = colRowCount
		}
	}
	return minRowCount, crossValidationSelectivity, nil
}
Loading

0 comments on commit 66ddb7b

Please sign in to comment.