planner: Move the Selectivity function from the stats package into cardinality package (#46410) ref #46358
pingcap · Aug 25, 2023 · 66ddb7b · 66ddb7b
1 parent 2adb1dc
commit 66ddb7b
Show file tree

Hide file tree

Showing 14 changed files with 436 additions and 419 deletions.
diff --git a/planner/cardinality/BUILD.bazel b/planner/cardinality/BUILD.bazel
@@ -27,6 +27,8 @@ go_library(
         "//types",
         "//types/parser_driver",
         "//util/chunk",
+        "//util/codec",
+        "//util/collate",
         "//util/logutil",
         "//util/ranger",
         "//util/tracing",
@@ -47,7 +49,7 @@ go_test(
     data = glob(["testdata/**"]),
     embed = [":cardinality"],
     flaky = True,
-    shard_count = 29,
+    shard_count = 30,
     deps = [
         "//config",
         "//domain",

diff --git a/planner/cardinality/row_count_column.go b/planner/cardinality/row_count_column.go
@@ -27,6 +27,8 @@ func init() {
 	statistics.GetRowCountByColumnRanges = GetRowCountByColumnRanges
 	statistics.GetRowCountByIntColumnRanges = GetRowCountByIntColumnRanges
 	statistics.GetRowCountByIndexRanges = GetRowCountByIndexRanges
+	statistics.OutOfRangeEQSelectivity = outOfRangeEQSelectivity
+	statistics.GetEqualCondSelectivity = getEqualCondSelectivity
 }
 
 // GetRowCountByColumnRanges estimates the row count by a slice of Range.

diff --git a/planner/cardinality/selectivity.go b/planner/cardinality/selectivity.go
@@ -28,12 +28,19 @@ import (
 	"github.com/pingcap/tidb/sessionctx"
 	"github.com/pingcap/tidb/statistics"
 	"github.com/pingcap/tidb/types"
+	"github.com/pingcap/tidb/util/chunk"
+	"github.com/pingcap/tidb/util/codec"
+	"github.com/pingcap/tidb/util/collate"
 	"github.com/pingcap/tidb/util/logutil"
 	"github.com/pingcap/tidb/util/ranger"
 	"go.uber.org/zap"
 	"golang.org/x/exp/maps"
 )
 
+var (
+	// outOfRangeBetweenRate is the lower bound that outOfRangeEQSelectivity applies to the NDV:
+	// a smaller NDV is raised to this value before the selectivity 1/NDV is computed.
+	outOfRangeBetweenRate int64 = 100
+)
+
 // Selectivity is a function calculate the selectivity of the expressions on the specified HistColl.
 // The definition of selectivity is (row count after filter / row count before filter).
 // And exprs must be CNF now, in other words, `exprs[0] and exprs[1] and ... and exprs[len - 1]`
@@ -350,7 +357,7 @@ OUTER:
 	// Try to cover remaining string matching functions by evaluating the expressions with TopN to estimate.
 	if ctx.GetSessionVars().EnableEvalTopNEstimationForStrMatch() {
 		for i, scalarCond := range notCoveredStrMatch {
-			ok, sel, err := coll.GetSelectivityByFilter(ctx, []expression.Expression{scalarCond})
+			ok, sel, err := GetSelectivityByFilter(ctx, coll, []expression.Expression{scalarCond})
 			if err != nil {
 				sc.AppendWarning(errors.New("Error when using TopN-assisted estimation: " + err.Error()))
 			}
@@ -365,7 +372,7 @@ OUTER:
 			}
 		}
 		for i, scalarCond := range notCoveredNegateStrMatch {
-			ok, sel, err := coll.GetSelectivityByFilter(ctx, []expression.Expression{scalarCond})
+			ok, sel, err := GetSelectivityByFilter(ctx, coll, []expression.Expression{scalarCond})
 			if err != nil {
 				sc.AppendWarning(errors.New("Error when using TopN-assisted estimation: " + err.Error()))
 			}
@@ -613,3 +620,325 @@ func getMaskAndRanges(ctx sessionctx.Context, exprs []expression.Expression, ran
 	}
 	return mask, ranges, false, nil
 }
+
+// GetSelectivityByFilter tries to estimate the selectivity of expressions by evaluating them against the TopN values,
+// the histogram bucket boundaries and NULL.
+// Currently, this method can only handle expressions involving a single column.
+//
+// Return values:
+//   - ok is false (with selectivity 0 and a nil error) when the estimation is not applicable: the filters are
+//     mutable or correlated, they do not involve exactly one column, the column is a "new collation" string column,
+//     no usable stats are found, the stats are not version 2, or the stats contain no rows at all.
+//   - An error while decoding or evaluating the TopN part, or while evaluating the histogram part, is returned
+//     with ok == false.
+//   - An error while evaluating the NULL part is returned together with ok == true; in that case the NULL part
+//     contributes 0 to the returned selectivity.
+func GetSelectivityByFilter(sctx sessionctx.Context, coll *statistics.HistColl, filters []expression.Expression) (ok bool, selectivity float64, err error) {
+	// 1. Make sure the expressions
+	//   (1) are safe to be evaluated here,
+	//   (2) involve only one column,
+	//   (3) and this column is not a "new collation" string column so that we're able to restore values from the stats.
+	for _, filter := range filters {
+		if expression.IsMutableEffectsExpr(filter) {
+			return false, 0, nil
+		}
+	}
+	if expression.ContainCorrelatedColumn(filters) {
+		return false, 0, nil
+	}
+	cols := expression.ExtractColumnsFromExpressions(nil, filters, nil)
+	if len(cols) != 1 {
+		return false, 0, nil
+	}
+	col := cols[0]
+	tp := col.RetType
+	if types.IsString(tp.GetType()) && collate.NewCollationEnabled() && !collate.IsBinCollation(tp.GetCollate()) {
+		return false, 0, nil
+	}
+
+	// 2. Get the available stats, make sure it's a ver2 stats and get the needed data structure from it.
+	isIndex, i := findAvailableStatsForCol(sctx, coll, col.UniqueID)
+	if i < 0 {
+		return false, 0, nil
+	}
+	var statsVer, nullCnt int64
+	var histTotalCnt, totalCnt float64
+	var topnTotalCnt uint64
+	var hist *statistics.Histogram
+	var topn *statistics.TopN
+	if isIndex {
+		stats := coll.Indices[i]
+		statsVer = stats.StatsVer
+		hist = &stats.Histogram
+		nullCnt = hist.NullCount
+		topn = stats.TopN
+	} else {
+		stats := coll.Columns[i]
+		statsVer = stats.StatsVer
+		hist = &stats.Histogram
+		nullCnt = hist.NullCount
+		topn = stats.TopN
+	}
+	// Only in stats ver2, we can assume that: TopN + Histogram + NULL == All data
+	if statsVer != statistics.Version2 {
+		return false, 0, nil
+	}
+	topnTotalCnt = topn.TotalCount()
+	histTotalCnt = hist.NotNullCount()
+	totalCnt = float64(topnTotalCnt) + histTotalCnt + float64(nullCnt)
+	if totalCnt <= 0 {
+		// The stats describe no rows. Every ratio below divides by totalCnt, so continuing would
+		// produce a NaN selectivity; report "not applicable" instead.
+		return false, 0, nil
+	}
+
+	var topNSel, histSel, nullSel float64
+
+	// Prepare for evaluation.
+
+	// For execution, we use Column.Index instead of Column.UniqueID to locate a column.
+	// We have only one column here, so we set it to 0.
+	originalIndex := col.Index
+	col.Index = 0
+	defer func() {
+		// Restore the original Index to avoid unexpected situation.
+		col.Index = originalIndex
+	}()
+	topNLen := 0
+	histBucketsLen := hist.Len()
+	if topn != nil {
+		topNLen = len(topn.TopN)
+	}
+	c := chunk.NewChunkWithCapacity([]*types.FieldType{tp}, max(1, topNLen))
+	selected := make([]bool, 0, max(histBucketsLen, topNLen))
+
+	// 3. Calculate the TopN part selectivity.
+	// This stage is considered as the core functionality of this method, errors in this stage would make this entire method fail.
+	var topNSelectedCnt uint64
+	if topn != nil {
+		for _, item := range topn.TopN {
+			_, val, err := codec.DecodeOne(item.Encoded)
+			if err != nil {
+				return false, 0, err
+			}
+			c.AppendDatum(0, &val)
+		}
+		selected, err = expression.VectorizedFilter(sctx, filters, chunk.NewIterator4Chunk(c), selected)
+		if err != nil {
+			return false, 0, err
+		}
+		for i, isTrue := range selected {
+			if isTrue {
+				topNSelectedCnt += topn.TopN[i].Count
+			}
+		}
+	}
+	topNSel = float64(topNSelectedCnt) / totalCnt
+
+	// 4. Calculate the Histogram part selectivity.
+	// The buckets upper bounds and the Bucket.Repeat are used like the TopN above.
+	// The buckets lower bounds are used as random samples and are regarded equally.
+	if hist != nil && histTotalCnt > 0 {
+		selected = selected[:0]
+		selected, err = expression.VectorizedFilter(sctx, filters, chunk.NewIterator4Chunk(hist.Bounds), selected)
+		if err != nil {
+			return false, 0, err
+		}
+		var bucketRepeatTotalCnt, bucketRepeatSelectedCnt, lowerBoundMatchCnt int64
+		for i := range hist.Buckets {
+			bucketRepeatTotalCnt += hist.Buckets[i].Repeat
+			// Bucket i reads selected[2*i] (lower bound) and selected[2*i+1] (upper bound),
+			// so both indexes must be within range.
+			if 2*i+1 >= len(selected) {
+				// This should not happen, but we add this check for safety.
+				break
+			}
+			if selected[2*i] {
+				lowerBoundMatchCnt++
+			}
+			if selected[2*i+1] {
+				bucketRepeatSelectedCnt += hist.Buckets[i].Repeat
+			}
+		}
+		var lowerBoundsRatio, upperBoundsRatio, lowerBoundsSel, upperBoundsSel float64
+		upperBoundsRatio = min(float64(bucketRepeatTotalCnt)/histTotalCnt, 1)
+		lowerBoundsRatio = 1 - upperBoundsRatio
+		if bucketRepeatTotalCnt > 0 {
+			upperBoundsSel = float64(bucketRepeatSelectedCnt) / float64(bucketRepeatTotalCnt)
+		}
+		lowerBoundsSel = float64(lowerBoundMatchCnt) / float64(histBucketsLen)
+		histSel = lowerBoundsSel*lowerBoundsRatio + upperBoundsSel*upperBoundsRatio
+		histSel *= histTotalCnt / totalCnt
+	}
+
+	// 5. Calculate the NULL part selectivity.
+	// Errors in this stage would be returned, but would not make this entire method fail.
+	c.Reset()
+	c.AppendNull(0)
+	selected = selected[:0]
+	selected, err = expression.VectorizedFilter(sctx, filters, chunk.NewIterator4Chunk(c), selected)
+	if err != nil || len(selected) != 1 || !selected[0] {
+		nullSel = 0
+	} else {
+		nullSel = float64(nullCnt) / totalCnt
+	}
+
+	// 6. Get the final result.
+	res := topNSel + histSel + nullSel
+	return true, res, err
+}
+
+// findAvailableStatsForCol looks for usable stats of the column identified by uniqueID.
+// The column stats are preferred; otherwise the stats of a single-column, non-prefix index on that column are used.
+// isIndex tells whether idx is a key of coll.Indices (true) or of coll.Columns (false).
+// idx is -1 when nothing usable is found.
+func findAvailableStatsForCol(sctx sessionctx.Context, coll *statistics.HistColl, uniqueID int64) (isIndex bool, idx int64) {
+	// First choice: the stats of the column itself.
+	colStats, found := coll.Columns[uniqueID]
+	if found && colStats != nil && !colStats.IsInvalid(sctx, coll.Pseudo) && colStats.IsFullLoad() {
+		return false, uniqueID
+	}
+	// Second choice: the stats of an index that consists of exactly this column.
+	for idxID, idxColIDs := range coll.Idx2ColumnIDs {
+		if len(idxColIDs) != 1 || idxColIDs[0] != uniqueID {
+			continue
+		}
+		idxStats, found := coll.Indices[idxID]
+		if !found {
+			continue
+		}
+		// A prefix index does not hold the complete column values, so it is skipped.
+		if idxStats.Info.Columns[0].Length != types.UnspecifiedLength {
+			continue
+		}
+		if idxStats.IsInvalid(sctx, coll.Pseudo) {
+			continue
+		}
+		if !idxStats.IsFullLoad() {
+			continue
+		}
+		return true, idxID
+	}
+	return false, -1
+}
+
+// getEqualCondSelectivity estimates the selectivity of equal conditions on the first usedColsLen columns of idx.
+// bytes is the encoded value that is looked up in the index stats; idxPointRange carries the same condition
+// as a range and is used for the per-column cross validation.
+func getEqualCondSelectivity(sctx sessionctx.Context, coll *statistics.HistColl, idx *statistics.Index, bytes []byte,
+	usedColsLen int, idxPointRange *ranger.Range) (result float64, err error) {
+	if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace {
+		debugtrace.EnterContextCommon(sctx)
+		defer func() {
+			idxName := ""
+			if idx != nil && idx.Info != nil {
+				idxName = idx.Info.Name.O
+			}
+			debugtrace.RecordAnyValuesWithNames(
+				sctx,
+				"Index Name", idxName,
+				"Encoded", bytes,
+				"UsedColLen", usedColsLen,
+				"Range", idxPointRange.String(),
+				"Result", result,
+				"error", err,
+			)
+			debugtrace.LeaveContextCommon(sctx)
+		}()
+	}
+	allColsUsed := usedColsLen == len(idx.Info.Columns)
+	// A unique index with every column fixed matches at most one row.
+	if allColsUsed && idx.Info.Unique {
+		return 1.0 / idx.TotalRowCount(), nil
+	}
+	if idx.OutOfRangeOnIndex(types.NewBytesDatum(bytes)) {
+		// The value is out of range, so it could not be found in the CM Sketch;
+		// fall back to a heuristic based on the NDV.
+		var ndv int64
+		if allColsUsed && idx.NDV > 0 {
+			ndv = idx.NDV
+		} else {
+			// Only prefix columns of the index are used (or the index NDV is unavailable):
+			// take the largest NDV among the used columns.
+			colIDs := coll.Idx2ColumnIDs[idx.ID]
+			for pos := 0; pos < len(colIDs) && pos < usedColsLen; pos++ {
+				colStats, found := coll.Columns[colIDs[pos]]
+				if !found {
+					continue
+				}
+				ndv = max(ndv, colStats.Histogram.NDV)
+			}
+		}
+		return outOfRangeEQSelectivity(sctx, ndv, coll.RealtimeCount, int64(idx.TotalRowCount())), nil
+	}
+
+	minRowCount, crossValidSel, err := crossValidationSelectivity(sctx, coll, idx, usedColsLen, idxPointRange)
+	if err != nil {
+		return 0, err
+	}
+
+	// Prefer the cross-validated estimation when the index estimation exceeds
+	// the smallest single-column row count.
+	idxRowCount := float64(idx.QueryBytes(sctx, bytes))
+	if idxRowCount > minRowCount {
+		return crossValidSel, nil
+	}
+	return idxRowCount / idx.TotalRowCount(), nil
+}
+
+// outOfRangeEQSelectivity estimates the selectivity of an equal condition whose value is out of the stats range.
+// The estimation assumes that every modification is an insertion, and that the newly inserted rows are
+// uniformly distributed in the same way as the analyzed rows, i.e. each distinct value owns Tot/NDV rows.
+// sctx is only used for the debug trace; passing nil is safe when the trace is not needed.
+func outOfRangeEQSelectivity(sctx sessionctx.Context, ndv, realtimeRowCount, columnRowCount int64) (result float64) {
+	if sctx != nil && sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace {
+		debugtrace.EnterContextCommon(sctx)
+		defer func() {
+			debugtrace.RecordAnyValuesWithNames(sctx, "Result", result)
+			debugtrace.LeaveContextCommon(sctx)
+		}()
+	}
+	newRows := realtimeRowCount - columnRowCount
+	if newRows <= 0 {
+		// No rows were added after the stats were collected, so the histogram covers the whole data.
+		return 0
+	}
+	// A small NDV would give an inaccurate selectivity, so a lower bound is applied.
+	effectiveNDV := ndv
+	if effectiveNDV < outOfRangeBetweenRate {
+		effectiveNDV = outOfRangeBetweenRate
+	}
+	sel := 1 / float64(effectiveNDV)
+	// The estimated row count must not exceed the number of newly added rows.
+	if float64(newRows) < sel*float64(columnRowCount) {
+		return float64(newRows) / float64(columnRowCount)
+	}
+	return sel
+}
+
+// crossValidationSelectivity gets the selectivity of multi-column equal conditions by cross validation.
+func crossValidationSelectivity(
+	sctx sessionctx.Context,
+	coll *statistics.HistColl,
+	idx *statistics.Index,
+	usedColsLen int,
+	idxPointRange *ranger.Range,
+) (
+	minRowCount float64,
+	crossValidationSelectivity float64,
+	err error,
+) {
+	if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace {
+		debugtrace.EnterContextCommon(sctx)
+		defer func() {
+			var idxName string
+			if idx != nil && idx.Info != nil {
+				idxName = idx.Info.Name.O
+			}
+			debugtrace.RecordAnyValuesWithNames(
+				sctx,
+				"Index Name", idxName,
+				"minRowCount", minRowCount,
+				"crossValidationSelectivity", crossValidationSelectivity,
+				"error", err,
+			)
+			debugtrace.LeaveContextCommon(sctx)
+		}()
+	}
+	minRowCount = math.MaxFloat64
+	cols := coll.Idx2ColumnIDs[idx.ID]
+	crossValidationSelectivity = 1.0
+	totalRowCount := idx.TotalRowCount()
+	for i, colID := range cols {
+		if i >= usedColsLen {
+			break
+		}
+		if col, ok := coll.Columns[colID]; ok {
+			if col.IsInvalid(sctx, coll.Pseudo) {
+				continue
+			}
+			// Since the column range is point range(LowVal is equal to HighVal), we need to set both LowExclude and HighExclude to false.
+			// Otherwise we would get 0.0 estRow from GetColumnRowCount.
+			rang := ranger.Range{
+				LowVal:      []types.Datum{idxPointRange.LowVal[i]},
+				LowExclude:  false,
+				HighVal:     []types.Datum{idxPointRange.HighVal[i]},
+				HighExclude: false,
+				Collators:   []collate.Collator{idxPointRange.Collators[i]},
+			}
+
+			rowCount, err := col.GetColumnRowCount(sctx, []*ranger.Range{&rang}, coll.RealtimeCount, coll.ModifyCount, col.IsHandle)
+			if err != nil {
+				return 0, 0, err
+			}
+			crossValidationSelectivity = crossValidationSelectivity * (rowCount / totalRowCount)
+
+			if rowCount < minRowCount {
+				minRowCount = rowCount
+			}
+		}
+	}
+	return minRowCount, crossValidationSelectivity, nil
+}