diff --git a/distsql/request_builder.go b/distsql/request_builder.go
index f66a83a5721d3..4144d3dc20056 100644
--- a/distsql/request_builder.go
+++ b/distsql/request_builder.go
@@ -173,7 +173,6 @@ func TableRangesToKVRanges(tid int64, ranges []*ranger.Range, fb *statistics.Que
 	if fb == nil || fb.Hist() == nil {
 		return tableRangesToKVRangesWithoutSplit(tid, ranges)
 	}
-	ranges = fb.Hist().SplitRange(ranges)
 	krs := make([]kv.KeyRange, 0, len(ranges))
 	feedbackRanges := make([]*ranger.Range, 0, len(ranges))
 	for _, ran := range ranges {
@@ -260,7 +259,7 @@ func IndexRangesToKVRanges(sc *stmtctx.StatementContext, tid, idxID int64, range
 		feedbackRanges = append(feedbackRanges, &ranger.Range{LowVal: []types.Datum{types.NewBytesDatum(low)},
 			HighVal: []types.Datum{types.NewBytesDatum(high)}, LowExclude: false, HighExclude: true})
 	}
-	feedbackRanges = fb.Hist().SplitRange(feedbackRanges)
+	feedbackRanges = fb.Hist().SplitRange(sc, feedbackRanges, true)
 	krs := make([]kv.KeyRange, 0, len(feedbackRanges))
 	for _, ran := range feedbackRanges {
 		low, high := ran.LowVal[0].GetBytes(), ran.HighVal[0].GetBytes()
diff --git a/executor/table_reader.go b/executor/table_reader.go
index 2f1bf0b3569db..154fd12133e90 100644
--- a/executor/table_reader.go
+++ b/executor/table_reader.go
@@ -78,6 +78,10 @@ func (e *TableReaderExecutor) Open(ctx context.Context) error {
 	}
 	e.resultHandler = &tableResultHandler{}
+	if e.feedback != nil && e.feedback.Hist() != nil {
+		// EncodeInt doesn't need a *stmtctx.StatementContext, so passing nil is safe here.
+		e.ranges = e.feedback.Hist().SplitRange(nil, e.ranges, false)
+	}
 	firstPartRanges, secondPartRanges := splitRanges(e.ranges, e.keepOrder)
 	firstResult, err := e.buildResp(ctx, firstPartRanges)
 	if err != nil {
diff --git a/planner/core/exhaust_physical_plans.go b/planner/core/exhaust_physical_plans.go
index 9b0cff40b499c..a3cebc8730662 100644
--- a/planner/core/exhaust_physical_plans.go
+++ b/planner/core/exhaust_physical_plans.go
@@ -459,7 +459,7 @@ func (p *LogicalJoin) constructInnerTableScan(ds *DataSource, pk *expression.Col
 	var rowCount float64
 	pkHist, ok := ds.statisticTable.Columns[pk.ID]
 	if ok && !ds.statisticTable.Pseudo {
-		rowCount = pkHist.AvgCountPerValue(ds.statisticTable.Count)
+		rowCount = pkHist.AvgCountPerNotNullValue(ds.statisticTable.Count)
 	} else {
 		rowCount = ds.statisticTable.PseudoAvgCountPerValue()
 	}
@@ -506,7 +506,7 @@ func (p *LogicalJoin) constructInnerIndexScan(ds *DataSource, idx *model.IndexIn
 	var rowCount float64
 	idxHist, ok := ds.statisticTable.Indices[idx.ID]
 	if ok && !ds.statisticTable.Pseudo {
-		rowCount = idxHist.AvgCountPerValue(ds.statisticTable.Count)
+		rowCount = idxHist.AvgCountPerNotNullValue(ds.statisticTable.Count)
 	} else {
 		rowCount = ds.statisticTable.PseudoAvgCountPerValue()
 	}
diff --git a/planner/core/logical_plans.go b/planner/core/logical_plans.go
index 1a625a93ad709..cee54de15f6b9 100644
--- a/planner/core/logical_plans.go
+++ b/planner/core/logical_plans.go
@@ -450,7 +450,7 @@ func (ds *DataSource) deriveIndexPathStats(path *accessPath) (bool, error) {
 	if corColInAccessConds {
 		idxHist, ok := ds.stats.HistColl.Indices[path.index.ID]
 		if ok && !ds.stats.HistColl.Pseudo {
-			path.countAfterAccess = idxHist.AvgCountPerValue(ds.statisticTable.Count)
+			path.countAfterAccess = idxHist.AvgCountPerNotNullValue(ds.statisticTable.Count)
 		} else {
 			path.countAfterAccess = ds.statisticTable.PseudoAvgCountPerValue()
 		}
@@ -461,7 +461,7 @@ func (ds *DataSource) deriveIndexPathStats(path *accessPath) (bool, error) {
 		path.countAfterAccess = math.Min(ds.stats.RowCount/selectionFactor, float64(ds.statisticTable.Count))
 	}
 	if path.indexFilters != nil {
-		selectivity, err := ds.stats.HistColl.Selectivity(ds.ctx, path.indexFilters)
+		selectivity, _, err := ds.stats.HistColl.Selectivity(ds.ctx, path.indexFilters)
 		if err != nil {
 			log.Warnf("An error happened: %v, we have to use the default selectivity", err.Error())
 			selectivity = selectionFactor
diff --git a/planner/core/rule_column_pruning.go b/planner/core/rule_column_pruning.go
index bf2d86a916575..170a8b0241502 100644
--- a/planner/core/rule_column_pruning.go
+++ b/planner/core/rule_column_pruning.go
@@ -17,6 +17,7 @@ import (
 	"fmt"
 	"github.com/pingcap/parser/ast"
 	"github.com/pingcap/parser/model"
+	"github.com/pingcap/parser/mysql"
 	"github.com/pingcap/tidb/expression"
 	"github.com/pingcap/tidb/infoschema"
 )
@@ -155,7 +156,15 @@ func (p *LogicalUnionScan) PruneColumns(parentUsedCols []*expression.Column) {
 // PruneColumns implements LogicalPlan interface.
 func (ds *DataSource) PruneColumns(parentUsedCols []*expression.Column) {
 	used := getUsedList(parentUsedCols, ds.schema)
+	var (
+		handleCol     *expression.Column
+		handleColInfo *model.ColumnInfo
+	)
 	for i := len(used) - 1; i >= 0; i-- {
+		if ds.tableInfo.PKIsHandle && mysql.HasPriKeyFlag(ds.Columns[i].Flag) {
+			handleCol = ds.schema.Columns[i]
+			handleColInfo = ds.Columns[i]
+		}
 		if !used[i] {
 			ds.schema.Columns = append(ds.schema.Columns[:i], ds.schema.Columns[i+1:]...)
 			ds.Columns = append(ds.Columns[:i], ds.Columns[i+1:]...)
@@ -169,8 +178,12 @@ func (ds *DataSource) PruneColumns(parentUsedCols []*expression.Column) {
 	// For SQL like `select 1 from t`, tikv's response will be empty if no column is in schema.
 	// So we'll force to push one if schema doesn't have any column.
 	if ds.schema.Len() == 0 && !infoschema.IsMemoryDB(ds.DBName.L) {
-		ds.Columns = append(ds.Columns, model.NewExtraHandleColInfo())
-		ds.schema.Append(ds.newExtraHandleSchemaCol())
+		if handleCol == nil {
+			handleCol = ds.newExtraHandleSchemaCol()
+			handleColInfo = model.NewExtraHandleColInfo()
+		}
+		ds.Columns = append(ds.Columns, handleColInfo)
+		ds.schema.Append(handleCol)
 	}
 }
diff --git a/planner/core/stats.go b/planner/core/stats.go
index 32972f6132a33..eadafd4961541 100644
--- a/planner/core/stats.go
+++ b/planner/core/stats.go
@@ -18,6 +18,7 @@ import (
 	"github.com/pingcap/tidb/expression"
 	"github.com/pingcap/tidb/planner/property"
+	"github.com/pingcap/tidb/statistics"
 	log "github.com/sirupsen/logrus"
 )
@@ -74,7 +75,7 @@ func (p *baseLogicalPlan) DeriveStats(childStats []*property.StatsInfo) (*proper
 	return profile, nil
 }

-func (ds *DataSource) getStatsByFilter(conds expression.CNFExprs) *property.StatsInfo {
+func (ds *DataSource) getStatsByFilter(conds expression.CNFExprs) (*property.StatsInfo, *statistics.HistColl) {
 	profile := &property.StatsInfo{
 		RowCount:    float64(ds.statisticTable.Count),
 		Cardinality: make([]float64, len(ds.Columns)),
@@ -91,12 +92,16 @@ func (ds *DataSource) getStatsByFilter(conds expression.CNFExprs) (*property.Stat
 		}
 	}
 	ds.stats = profile
-	selectivity, err := profile.HistColl.Selectivity(ds.ctx, conds)
+	selectivity, nodes, err := profile.HistColl.Selectivity(ds.ctx, conds)
 	if err != nil {
 		log.Warnf("An error happened: %v, we have to use the default selectivity", err.Error())
 		selectivity = selectionFactor
 	}
-	return profile.Scale(selectivity)
+	if ds.ctx.GetSessionVars().OptimizerSelectivityLevel >= 1 && ds.stats.HistColl != nil {
+		finalHist := ds.stats.HistColl.NewHistCollBySelectivity(ds.ctx.GetSessionVars().StmtCtx, nodes)
+		return profile, finalHist
+	}
+	return profile.Scale(selectivity), nil
 }

 // DeriveStats implement LogicalPlan DeriveStats interface.
@@ -105,7 +110,8 @@ func (ds *DataSource) DeriveStats(childStats []*property.StatsInfo) (*property.S
 	for i, expr := range ds.pushedDownConds {
 		ds.pushedDownConds[i] = expression.PushDownNot(nil, expr, false)
 	}
-	ds.stats = ds.getStatsByFilter(ds.pushedDownConds)
+	var finalHist *statistics.HistColl
+	ds.stats, finalHist = ds.getStatsByFilter(ds.pushedDownConds)
 	for _, path := range ds.possibleAccessPaths {
 		if path.isTablePath {
 			noIntervalRanges, err := ds.deriveTablePathStats(path)
@@ -131,6 +137,9 @@ func (ds *DataSource) DeriveStats(childStats []*property.StatsInfo) (*property.S
 			break
 		}
 	}
+	if ds.ctx.GetSessionVars().OptimizerSelectivityLevel >= 1 {
+		ds.stats.HistColl = finalHist
+	}
 	return ds.stats, nil
 }
diff --git a/planner/property/stats_info.go b/planner/property/stats_info.go
index 67c112c20d256..8881f044d7ba5 100644
--- a/planner/property/stats_info.go
+++ b/planner/property/stats_info.go
@@ -24,7 +24,7 @@ type StatsInfo struct {
 	RowCount    float64
 	Cardinality []float64

-	HistColl statistics.HistColl
+	HistColl *statistics.HistColl
 	// UsePseudoStats indicates whether the StatsInfo is calculated using the
 	// pseudo statistics on a table.
 	UsePseudoStats bool
diff --git a/statistics/feedback.go b/statistics/feedback.go
index 392efd3c9b9a0..aedf265db3186 100644
--- a/statistics/feedback.go
+++ b/statistics/feedback.go
@@ -294,7 +294,7 @@ func buildBucketFeedback(h *Histogram, feedback *QueryFeedback) (map[int]*Bucket
 	}
 	total := 0
 	sc := &stmtctx.StatementContext{TimeZone: time.UTC}
-	min, max := getMinValue(h.tp), getMaxValue(h.tp)
+	min, max := getMinValue(h.Tp), getMaxValue(h.Tp)
 	for _, fb := range feedback.feedback {
 		skip, err := fb.adjustFeedbackBoundaries(sc, &min, &max)
 		if err != nil {
@@ -606,7 +606,7 @@ func UpdateCMSketch(c *CMSketch, eqFeedbacks []feedback) *CMSketch {
 }

 func buildNewHistogram(h *Histogram, buckets []bucket) *Histogram {
-	hist := NewHistogram(h.ID, h.NDV, h.NullCount, h.LastUpdateVersion, h.tp, len(buckets), h.TotColSize)
+	hist := NewHistogram(h.ID, h.NDV, h.NullCount, h.LastUpdateVersion, h.Tp, len(buckets), h.TotColSize)
 	preCount := int64(0)
 	for _, bkt := range buckets {
 		hist.AppendBucket(bkt.lower, bkt.upper, bkt.count+preCount, bkt.repeat)
@@ -622,7 +622,7 @@ type queryFeedback struct {
 	HashValues  []uint64
 	IndexRanges [][]byte
 	// Counts is the number of scan keys in each range. It first stores the count for `IntRanges`, `IndexRanges` or `ColumnRanges`.
-	// After that, it stores the ranges for `HashValues`.
+	// After that, it stores the counts for `HashValues`.
 	Counts       []int64
 	ColumnRanges [][]byte
 }
@@ -814,7 +814,7 @@ func (q *QueryFeedback) recalculateExpectCount(h *Handle) error {
 	if tablePseudo == false {
 		return nil
 	}
-	isIndex := q.hist.tp.Tp == mysql.TypeBlob
+	isIndex := q.hist.Tp.Tp == mysql.TypeBlob
 	id := q.hist.ID
 	if isIndex && (t.Indices[id] == nil || t.Indices[id].NotAccurate() == false) {
 		return nil
 	}
@@ -1056,7 +1056,7 @@ func dumpFeedbackForIndex(h *Handle, q *QueryFeedback, t *Table) error {
 		equalityCount, rangeCount = getNewCountForIndex(equalityCount, rangeCount, float64(t.Count), float64(q.feedback[i].count))
 		value := types.NewBytesDatum(bytes)
 		q.feedback[i] = feedback{lower: &value, upper: &value, count: int64(equalityCount)}
-		err = rangeFB.dumpRangeFeedback(h, &rang, rangeCount)
+		err = rangeFB.dumpRangeFeedback(sc, h, &rang, rangeCount)
 		if err != nil {
 			log.Debug("dump range feedback failed:", err)
 			continue
@@ -1065,9 +1065,8 @@ func dumpFeedbackForIndex(h *Handle, q *QueryFeedback, t *Table) error {
 	return errors.Trace(h.dumpFeedbackToKV(q))
 }

-func (q *QueryFeedback) dumpRangeFeedback(h *Handle, ran *ranger.Range, rangeCount float64) error {
+func (q *QueryFeedback) dumpRangeFeedback(sc *stmtctx.StatementContext, h *Handle, ran *ranger.Range, rangeCount float64) error {
 	if q.tp == indexType {
-		sc := &stmtctx.StatementContext{TimeZone: time.UTC}
 		lower, err := codec.EncodeKey(sc, nil, ran.LowVal[0])
 		if err != nil {
 			return errors.Trace(err)
@@ -1079,17 +1078,17 @@ func (q *QueryFeedback) dumpRangeFeedback(h *Handle, ran *ranger.Range, rangeCou
 		ran.LowVal[0].SetBytes(lower)
 		ran.HighVal[0].SetBytes(upper)
 	} else {
-		if !supportColumnType(q.hist.tp) {
+		if !supportColumnType(q.hist.Tp) {
 			return nil
 		}
 		if ran.LowVal[0].Kind() == types.KindMinNotNull {
-			ran.LowVal[0] = getMinValue(q.hist.tp)
+			ran.LowVal[0] = getMinValue(q.hist.Tp)
 		}
 		if ran.HighVal[0].Kind() == types.KindMaxValue {
-			ran.HighVal[0] = getMaxValue(q.hist.tp)
+			ran.HighVal[0] = getMaxValue(q.hist.Tp)
 		}
 	}
-	ranges := q.hist.SplitRange([]*ranger.Range{ran})
+	ranges := q.hist.SplitRange(sc, []*ranger.Range{ran}, q.tp == indexType)
 	counts := make([]float64, 0, len(ranges))
 	sum := 0.0
 	for _, r := range ranges {
diff --git a/statistics/feedback_test.go b/statistics/feedback_test.go
index da19fc233342d..5c71aa0013da2 100644
--- a/statistics/feedback_test.go
+++ b/statistics/feedback_test.go
@@ -202,7 +202,7 @@ func (s *testFeedbackSuite) TestMergeBuckets(c *C) {
 		}
 		defaultBucketCount = t.bucketCount
 		bkts = mergeBuckets(bkts, t.isNewBuckets, float64(totalCount))
-		result := buildNewHistogram(&Histogram{tp: types.NewFieldType(mysql.TypeLong)}, bkts).ToString(0)
+		result := buildNewHistogram(&Histogram{Tp: types.NewFieldType(mysql.TypeLong)}, bkts).ToString(0)
 		c.Assert(result, Equals, t.result)
 	}
 }
@@ -228,7 +228,7 @@ func (s *testFeedbackSuite) TestFeedbackEncoding(c *C) {
 	}
 	c.Assert(q.Equal(rq), IsTrue)

-	hist.tp = types.NewFieldType(mysql.TypeBlob)
+	hist.Tp = types.NewFieldType(mysql.TypeBlob)
 	q = &QueryFeedback{hist: hist}
 	q.feedback = append(q.feedback, feedback{encodeInt(0), encodeInt(3), 1, 0})
 	q.feedback = append(q.feedback, feedback{encodeInt(0), encodeInt(1), 1, 0})
diff --git a/statistics/histogram.go b/statistics/histogram.go
index 43ba719424041..860c8a4d476fd 100644
--- a/statistics/histogram.go
+++ b/statistics/histogram.go
@@ -34,6 +34,7 @@ import (
 	"github.com/pingcap/tidb/util/ranger"
 	"github.com/pingcap/tidb/util/sqlexec"
 	"github.com/pingcap/tipb/go-tipb"
+	log "github.com/sirupsen/logrus"
 )

 // Histogram represents statistics for a column or index.
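A minimal sketch, not part of the patch: exporting `tp` as `Tp` in the hunks below is what lets code outside the `statistics` package inspect a histogram's field type, as the `statistics/update.go` hunk near the end of this diff relies on. `hist` here is an assumed `*statistics.Histogram`:

	// From another package, e.g. while decoding feedback:
	unsigned := mysql.HasUnsignedFlag(hist.Tp.Flag) // not possible with the unexported `tp`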
@@ -44,7 +45,7 @@ type Histogram struct {
 	// LastUpdateVersion is the version that this histogram updated last time.
 	LastUpdateVersion uint64

-	tp *types.FieldType
+	Tp *types.FieldType

 	// Histogram elements.
 	//
@@ -84,7 +85,7 @@ func NewHistogram(id, ndv, nullCount int64, version uint64, tp *types.FieldType,
 		NDV:               ndv,
 		NullCount:         nullCount,
 		LastUpdateVersion: version,
-		tp:                tp,
+		Tp:                tp,
 		Bounds:            chunk.NewChunkWithCapacity([]*types.FieldType{tp}, 2*bucketSize),
 		Buckets:           make([]Bucket, 0, bucketSize),
 		TotColSize:        totColSize,
@@ -93,13 +94,13 @@ func NewHistogram(id, ndv, nullCount int64, version uint64, tp *types.FieldType,

 // GetLower gets the lower bound of bucket `idx`.
 func (hg *Histogram) GetLower(idx int) *types.Datum {
-	d := hg.Bounds.GetRow(2*idx).GetDatum(0, hg.tp)
+	d := hg.Bounds.GetRow(2*idx).GetDatum(0, hg.Tp)
 	return &d
 }

 // GetUpper gets the upper bound of bucket `idx`.
 func (hg *Histogram) GetUpper(idx int) *types.Datum {
-	d := hg.Bounds.GetRow(2*idx+1).GetDatum(0, hg.tp)
+	d := hg.Bounds.GetRow(2*idx+1).GetDatum(0, hg.Tp)
 	return &d
 }

@@ -108,7 +109,7 @@ func (c *Column) AvgColSize(count int64) float64 {
 	if count == 0 {
 		return 0
 	}
-	switch c.Histogram.tp.Tp {
+	switch c.Histogram.Tp.Tp {
 	case mysql.TypeFloat:
 		return 4
 	case mysql.TypeTiny, mysql.TypeShort, mysql.TypeInt24, mysql.TypeLong, mysql.TypeLonglong,
@@ -138,11 +139,11 @@ func (hg *Histogram) updateLastBucket(upper *types.Datum, count, repeat int64) {
 	hg.Buckets[len-1] = Bucket{Count: count, Repeat: repeat}
 }

-// DecodeTo decodes the histogram bucket values into `tp`.
+// DecodeTo decodes the histogram bucket values into the given field type `tp`.
 func (hg *Histogram) DecodeTo(tp *types.FieldType, timeZone *time.Location) error {
 	oldIter := chunk.NewIterator4Chunk(hg.Bounds)
 	hg.Bounds = chunk.NewChunkWithCapacity([]*types.FieldType{tp}, oldIter.Len())
-	hg.tp = tp
+	hg.Tp = tp
 	for row := oldIter.Begin(); row != oldIter.End(); row = oldIter.Next() {
 		datum, err := tablecodec.DecodeColumnValue(row.GetBytes(0), tp, timeZone)
 		if err != nil {
@@ -153,12 +154,12 @@ func (hg *Histogram) DecodeTo(tp *types.FieldType, timeZone *time.Location) erro
 	return nil
 }

-// ConvertTo converts the histogram bucket values into `tp`.
+// ConvertTo converts the histogram bucket values into the given field type `tp`.
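+// The converted bounds are appended to a freshly built histogram, so the receiver is left unchanged.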
 func (hg *Histogram) ConvertTo(sc *stmtctx.StatementContext, tp *types.FieldType) (*Histogram, error) {
 	hist := NewHistogram(hg.ID, hg.NDV, hg.NullCount, hg.LastUpdateVersion, tp, hg.Len(), hg.TotColSize)
 	iter := chunk.NewIterator4Chunk(hg.Bounds)
 	for row := iter.Begin(); row != iter.End(); row = iter.Next() {
-		d := row.GetDatum(0, hg.tp)
+		d := row.GetDatum(0, hg.Tp)
 		d, err := d.ConvertTo(sc, tp)
 		if err != nil {
 			return nil, errors.Trace(err)
@@ -390,14 +391,14 @@ func (hg *Histogram) equalRowCount(value types.Datum) float64 {
 		if match {
 			return float64(hg.Buckets[index/2].Repeat)
 		}
-		return hg.totalRowCount() / float64(hg.NDV)
+		return hg.notNullCount() / float64(hg.NDV)
 	}
 	if match {
-		cmp := chunk.GetCompareFunc(hg.tp)
+		cmp := chunk.GetCompareFunc(hg.Tp)
 		if cmp(hg.Bounds.GetRow(index), 0, hg.Bounds.GetRow(index+1), 0) == 0 {
 			return float64(hg.Buckets[index/2].Repeat)
 		}
-		return hg.totalRowCount() / float64(hg.NDV)
+		return hg.notNullCount() / float64(hg.NDV)
 	}
 	return 0
 }
@@ -467,16 +468,20 @@ func (hg *Histogram) betweenRowCount(a, b types.Datum) float64 {
 }

 func (hg *Histogram) totalRowCount() float64 {
+	return hg.notNullCount() + float64(hg.NullCount)
+}
+
+func (hg *Histogram) notNullCount() float64 {
 	if hg.Len() == 0 {
-		return float64(hg.NullCount)
+		return 0
 	}
-	return float64(hg.Buckets[hg.Len()-1].Count + hg.NullCount)
+	return float64(hg.Buckets[hg.Len()-1].Count)
 }

 // mergeBuckets is used to merge every two neighbor buckets.
 func (hg *Histogram) mergeBuckets(bucketIdx int) {
 	curBuck := 0
-	c := chunk.NewChunkWithCapacity([]*types.FieldType{hg.tp}, bucketIdx)
+	c := chunk.NewChunkWithCapacity([]*types.FieldType{hg.Tp}, bucketIdx)
 	for i := 0; i+1 <= bucketIdx; i += 2 {
 		hg.Buckets[curBuck] = hg.Buckets[i+1]
 		c.AppendDatum(0, hg.GetLower(i))
@@ -506,12 +511,20 @@ func (hg *Histogram) getIncreaseFactor(totalCount int64) float64 {

 // validRange checks if the range is valid, it is used by `SplitRange` to remove the invalid range,
 // the possible types of range are index key range and handle key range.
-func validRange(ran *ranger.Range) bool {
+func validRange(sc *stmtctx.StatementContext, ran *ranger.Range, encoded bool) bool {
 	var low, high []byte
-	if ran.LowVal[0].Kind() == types.KindBytes {
+	if encoded {
 		low, high = ran.LowVal[0].GetBytes(), ran.HighVal[0].GetBytes()
 	} else {
-		low, high = codec.EncodeInt(nil, ran.LowVal[0].GetInt64()), codec.EncodeInt(nil, ran.HighVal[0].GetInt64())
+		var err error
+		low, err = codec.EncodeKey(sc, nil, ran.LowVal[0])
+		if err != nil {
+			return false
+		}
+		high, err = codec.EncodeKey(sc, nil, ran.HighVal[0])
+		if err != nil {
+			return false
+		}
 	}
 	if ran.LowExclude {
 		low = kv.Key(low).PrefixNext()
@@ -523,9 +536,9 @@
 }

 // SplitRange splits the range according to the histogram upper bound. Note that we treat last bucket's upper bound
-// as inf, so all the split ranges will totally fall in one of the (-inf, u(0)], (u(0), u(1)],...(u(n-3), u(n-2)],
+// as inf, so all the split ranges will fall entirely within one of the (-inf, u(0)], (u(0), u(1)],...(u(n-3), u(n-2)],
 // (u(n-2), +inf), where n is the number of buckets, u(i) is the i-th bucket's upper bound.
-func (hg *Histogram) SplitRange(ranges []*ranger.Range) []*ranger.Range {
+func (hg *Histogram) SplitRange(sc *stmtctx.StatementContext, ranges []*ranger.Range, encoded bool) []*ranger.Range {
 	split := make([]*ranger.Range, 0, len(ranges))
 	for len(ranges) > 0 {
 		// Find the last bound that greater or equal to the LowVal.
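A usage sketch for the new SplitRange signature, not part of the patch; `idxHist`, `pkHist` and the range slices are illustrative. `encoded` tells SplitRange whether the range bounds are already encoded index keys; when it is false, `validRange` above encodes them with codec.EncodeKey, which is the only place within SplitRange that needs the StatementContext — hence the nil `sc` at the integer-handle call site in executor/table_reader.go:

	splitIdx := idxHist.SplitRange(sc, idxRanges, true)    // index ranges: bounds are already encoded bytes
	splitPK := pkHist.SplitRange(nil, handleRanges, false) // int handle ranges: EncodeInt needs no StatementContext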
@@ -561,7 +574,7 @@ func (hg *Histogram) SplitRange(sc *stmtctx.StatementContext, ranges []*ranger.R
 		// Split according to the upper bound.
 		cmp := chunk.Compare(upperBound, 0, &ranges[0].LowVal[0])
 		if cmp > 0 || (cmp == 0 && !ranges[0].LowExclude) {
-			upper := upperBound.GetDatum(0, hg.tp)
+			upper := upperBound.GetDatum(0, hg.Tp)
 			split = append(split, &ranger.Range{
 				LowExclude:  ranges[0].LowExclude,
 				LowVal:      []types.Datum{ranges[0].LowVal[0]},
@@ -569,7 +582,7 @@ func (hg *Histogram) SplitRange(sc *stmtctx.StatementContext, ranges []*ranger.R
 				HighExclude: false})
 			ranges[0].LowVal[0] = upper
 			ranges[0].LowExclude = true
-			if !validRange(ranges[0]) {
+			if !validRange(sc, ranges[0], encoded) {
 				ranges = ranges[1:]
 			}
 		}
@@ -617,13 +630,13 @@ func HistogramFromProto(protoHg *tipb.Histogram) *Histogram {

 func (hg *Histogram) popFirstBucket() {
 	hg.Buckets = hg.Buckets[1:]
-	c := chunk.NewChunkWithCapacity([]*types.FieldType{hg.tp, hg.tp}, hg.Bounds.NumRows()-2)
+	c := chunk.NewChunkWithCapacity([]*types.FieldType{hg.Tp, hg.Tp}, hg.Bounds.NumRows()-2)
 	c.Append(hg.Bounds, 2, hg.Bounds.NumRows())
 	hg.Bounds = c
 }

 func (hg *Histogram) isIndexHist() bool {
-	return hg.tp.Tp == mysql.TypeBlob
+	return hg.Tp.Tp == mysql.TypeBlob
 }

 // MergeHistograms merges two histograms.
@@ -677,13 +690,15 @@ func MergeHistograms(sc *stmtctx.StatementContext, lh *Histogram, rh *Histogram,
 	return lh, nil
 }

-// AvgCountPerValue gets the average row count per value by the data of histogram.
-func (hg *Histogram) AvgCountPerValue(totalCount int64) float64 {
-	curNDV := float64(hg.NDV) * hg.getIncreaseFactor(totalCount)
+// AvgCountPerNotNullValue gets the average row count per non-null value using the histogram data.
+func (hg *Histogram) AvgCountPerNotNullValue(totalCount int64) float64 {
+	factor := hg.getIncreaseFactor(totalCount)
+	totalNotNull := hg.notNullCount() * factor
+	curNDV := float64(hg.NDV) * factor
 	if curNDV == 0 {
 		curNDV = 1
 	}
-	return float64(totalCount) / curNDV
+	return totalNotNull / curNDV
 }

 func (hg *Histogram) outOfRange(val types.Datum) bool {
@@ -866,6 +881,168 @@ func (idx *Index) getRowCount(sc *stmtctx.StatementContext, indexRanges []*range
 	return totalCount, nil
 }

+type countByRangeFunc = func(*stmtctx.StatementContext, int64, []*ranger.Range) (float64, error)
+
+// newHistogramBySelectivity fills in the content of the new histogram according to the given selectivity result.
+// TODO: Datum is not efficient, try to avoid using it here.
+// Also, there is redundant calculation shared with Selectivity(); we need to reduce it too.
+func newHistogramBySelectivity(sc *stmtctx.StatementContext, histID int64, oldHist, newHist *Histogram, ranges []*ranger.Range, cntByRangeFunc countByRangeFunc) error {
+	cntPerVal := int64(oldHist.AvgCountPerNotNullValue(int64(oldHist.totalRowCount())))
+	var totCnt int64 = 0
+	for boundIdx, ranIdx, highRangeIdx := 0, 0, 0; boundIdx < oldHist.Bounds.NumRows() && ranIdx < len(ranges); boundIdx, ranIdx = boundIdx+2, highRangeIdx {
+		for highRangeIdx < len(ranges) && chunk.Compare(oldHist.Bounds.GetRow(boundIdx+1), 0, &ranges[highRangeIdx].HighVal[0]) >= 0 {
+			highRangeIdx++
+		}
+		if boundIdx+2 >= oldHist.Bounds.NumRows() && highRangeIdx < len(ranges) && ranges[highRangeIdx].HighVal[0].Kind() == types.KindMaxValue {
+			highRangeIdx++
+		}
+		if ranIdx == highRangeIdx {
+			continue
+		}
+		cnt, err := cntByRangeFunc(sc, histID, ranges[ranIdx:highRangeIdx])
+		// This should not happen.
+		if err != nil {
+			return err
+		}
+		if cnt == 0 {
+			continue
+		}
+		if int64(cnt) > oldHist.bucketCount(boundIdx/2) {
+			cnt = float64(oldHist.bucketCount(boundIdx / 2))
+		}
+		newHist.Bounds.AppendRow(oldHist.Bounds.GetRow(boundIdx))
+		newHist.Bounds.AppendRow(oldHist.Bounds.GetRow(boundIdx + 1))
+		totCnt += int64(cnt)
+		bkt := Bucket{Count: totCnt}
+		if chunk.Compare(oldHist.Bounds.GetRow(boundIdx+1), 0, &ranges[highRangeIdx-1].HighVal[0]) == 0 && !ranges[highRangeIdx-1].HighExclude {
+			bkt.Repeat = cntPerVal
+		}
+		newHist.Buckets = append(newHist.Buckets, bkt)
+		switch newHist.Tp.EvalType() {
+		case types.ETString, types.ETDecimal, types.ETDatetime, types.ETTimestamp:
+			newHist.scalars = append(newHist.scalars, oldHist.scalars[boundIdx/2])
+		}
+	}
+	return nil
+}
+
+func (idx *Index) newIndexBySelectivity(sc *stmtctx.StatementContext, statsNode *StatsNode) (*Index, error) {
+	var (
+		ranLowEncode, ranHighEncode []byte
+		err                         error
+	)
+	newIndexHist := &Index{Info: idx.Info, statsVer: idx.statsVer, CMSketch: idx.CMSketch}
+	newIndexHist.Histogram = *NewHistogram(idx.ID, int64(float64(idx.NDV)*statsNode.Selectivity), 0, 0, types.NewFieldType(mysql.TypeBlob), chunk.InitialCapacity, 0)
+
+	lowBucketIdx, highBucketIdx := 0, 0
+	var totCnt int64 = 0
+
+	// Index bucket bounds are stored encoded, so we would need to decode them to compute the covered
+	// fraction of a bucket accurately.
+	// TODO: enhance this calculation. For now, simply drop every bucket that no range falls into.
+	for _, ran := range statsNode.Ranges {
+		lowBucketIdx = highBucketIdx
+		ranLowEncode, ranHighEncode, err = ran.Encode(sc, ranLowEncode, ranHighEncode)
+		if err != nil {
+			return nil, err
+		}
+		for ; highBucketIdx < idx.Len(); highBucketIdx++ {
+			// An encoded value can only be advanced to its PrefixNext cheaply, so ranHighEncode is actually
+			// range.HighVal's PrefixNext value; the bucket bound therefore goes to its PrefixNext as well.
+			bucketLowerEncoded := idx.Bounds.GetRow(highBucketIdx * 2).GetBytes(0)
+			if bytes.Compare(ranHighEncode, kv.Key(bucketLowerEncoded).PrefixNext()) < 0 {
+				break
+			}
+		}
+		for ; lowBucketIdx < highBucketIdx; lowBucketIdx++ {
+			bucketUpperEncoded := idx.Bounds.GetRow(lowBucketIdx*2 + 1).GetBytes(0)
+			if bytes.Compare(ranLowEncode, bucketUpperEncoded) <= 0 {
+				break
+			}
+		}
+		if lowBucketIdx >= idx.Len() {
+			break
+		}
+		for i := lowBucketIdx; i < highBucketIdx; i++ {
+			newIndexHist.Bounds.AppendRow(idx.Bounds.GetRow(i * 2))
+			newIndexHist.Bounds.AppendRow(idx.Bounds.GetRow(i*2 + 1))
+			totCnt += idx.bucketCount(i)
+			newIndexHist.Buckets = append(newIndexHist.Buckets, Bucket{Repeat: idx.Buckets[i].Repeat, Count: totCnt})
+			newIndexHist.scalars = append(newIndexHist.scalars, idx.scalars[i])
+		}
+	}
+	return newIndexHist, nil
+}
+
+// NewHistCollBySelectivity creates a new HistColl according to the given statsNodes.
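+// Columns and indices that are not covered by any StatsNode keep their original
+// histograms; the two loops at the end of the function copy them over unchanged.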
+func (coll *HistColl) NewHistCollBySelectivity(sc *stmtctx.StatementContext, statsNodes []*StatsNode) *HistColl {
+	newColl := &HistColl{
+		Columns:       make(map[int64]*Column),
+		Indices:       make(map[int64]*Index),
+		Idx2ColumnIDs: coll.Idx2ColumnIDs,
+		ColID2IdxID:   coll.ColID2IdxID,
+		Count:         coll.Count,
+	}
+	for _, node := range statsNodes {
+		if node.Tp == indexType {
+			idxHist, ok := coll.Indices[node.ID]
+			if !ok {
+				continue
+			}
+			newIdxHist, err := idxHist.newIndexBySelectivity(sc, node)
+			if err != nil {
+				log.Warnf("[Histogram-in-plan]: error happened when calculating row count: %v, failed to build histogram for index %v of table %v", err, idxHist.Info.Name, idxHist.Info.Table)
+				continue
+			}
+			newColl.Indices[node.ID] = newIdxHist
+			continue
+		}
+		oldCol, ok := coll.Columns[node.ID]
+		if !ok {
+			continue
+		}
+		newCol := &Column{Info: oldCol.Info, isHandle: oldCol.isHandle, CMSketch: oldCol.CMSketch}
+		newCol.Histogram = *NewHistogram(oldCol.ID, int64(float64(oldCol.NDV)*node.Selectivity), 0, 0, oldCol.Tp, chunk.InitialCapacity, 0)
+		var err error
+		splitRanges := oldCol.Histogram.SplitRange(sc, node.Ranges, false)
+		// Deal with some corner cases.
+		if len(splitRanges) > 0 {
+			// Deal with NULL values.
+			if splitRanges[0].LowVal[0].IsNull() {
+				newCol.NullCount = oldCol.NullCount
+				if splitRanges[0].HighVal[0].IsNull() {
+					splitRanges = splitRanges[1:]
+				} else {
+					splitRanges[0].LowVal[0].SetMinNotNull()
+				}
+			}
+		}
+		if oldCol.isHandle {
+			err = newHistogramBySelectivity(sc, node.ID, &oldCol.Histogram, &newCol.Histogram, splitRanges, coll.GetRowCountByIntColumnRanges)
+		} else {
+			err = newHistogramBySelectivity(sc, node.ID, &oldCol.Histogram, &newCol.Histogram, splitRanges, coll.GetRowCountByColumnRanges)
+		}
+		if err != nil {
+			log.Warnf("[Histogram-in-plan]: error happened when calculating row count: %v", err)
+			continue
+		}
+		newColl.Columns[node.ID] = newCol
+	}
+	for id, idx := range coll.Indices {
+		_, ok := newColl.Indices[id]
+		if !ok {
+			newColl.Indices[id] = idx
+		}
+	}
+	for id, col := range coll.Columns {
+		_, ok := newColl.Columns[id]
+		if !ok {
+			newColl.Columns[id] = col
+		}
+	}
+	return newColl
+}
+
 func (idx *Index) outOfRange(val types.Datum) bool {
 	if idx.Histogram.Len() == 0 {
 		return true
diff --git a/statistics/histogram_test.go b/statistics/histogram_test.go
new file mode 100644
index 0000000000000..73b7daaa64ecc
--- /dev/null
+++ b/statistics/histogram_test.go
@@ -0,0 +1,122 @@
+// Copyright 2018 PingCAP, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package statistics
+
+import (
+	. "github.com/pingcap/check"
+	"github.com/pingcap/parser/model"
+	"github.com/pingcap/parser/mysql"
+	"github.com/pingcap/tidb/types"
+	"github.com/pingcap/tidb/util/chunk"
+	"github.com/pingcap/tidb/util/codec"
+	"github.com/pingcap/tidb/util/mock"
+	"github.com/pingcap/tidb/util/ranger"
+)
+
+func (s *testStatisticsSuite) TestNewHistogramBySelectivity(c *C) {
+	coll := &HistColl{
+		Count:   330,
+		Columns: make(map[int64]*Column),
+		Indices: make(map[int64]*Index),
+	}
+	ctx := mock.NewContext()
+	sc := ctx.GetSessionVars().StmtCtx
+	intCol := &Column{}
+	intCol.Histogram = *NewHistogram(1, 30, 30, 0, types.NewFieldType(mysql.TypeLonglong), chunk.InitialCapacity, 0)
+	intCol.isHandle = true
+	for i := 0; i < 10; i++ {
+		intCol.Bounds.AppendInt64(0, int64(i*3))
+		intCol.Bounds.AppendInt64(0, int64(i*3+2))
+		intCol.Buckets = append(intCol.Buckets, Bucket{Repeat: 10, Count: int64(30*i + 30)})
+	}
+	coll.Columns[1] = intCol
+	node := &StatsNode{ID: 1, Tp: pkType, Selectivity: 0.56}
+	node.Ranges = append(node.Ranges, &ranger.Range{LowVal: types.MakeDatums(nil), HighVal: types.MakeDatums(nil)})
+	node.Ranges = append(node.Ranges, &ranger.Range{LowVal: []types.Datum{types.MinNotNullDatum()}, HighVal: types.MakeDatums(2)})
+	node.Ranges = append(node.Ranges, &ranger.Range{LowVal: types.MakeDatums(5), HighVal: types.MakeDatums(6)})
+	node.Ranges = append(node.Ranges, &ranger.Range{LowVal: types.MakeDatums(8), HighVal: types.MakeDatums(10)})
+	node.Ranges = append(node.Ranges, &ranger.Range{LowVal: types.MakeDatums(13), HighVal: types.MakeDatums(13)})
+	node.Ranges = append(node.Ranges, &ranger.Range{LowVal: types.MakeDatums(25), HighVal: []types.Datum{types.MaxValueDatum()}})
+	intColResult := `column:1 ndv:16 totColSize:0
+num: 30 lower_bound: 0 upper_bound: 2 repeats: 10
+num: 10 lower_bound: 3 upper_bound: 5 repeats: 10
+num: 20 lower_bound: 6 upper_bound: 8 repeats: 10
+num: 20 lower_bound: 9 upper_bound: 11 repeats: 0
+num: 10 lower_bound: 12 upper_bound: 14 repeats: 0
+num: 20 lower_bound: 24 upper_bound: 26 repeats: 10
+num: 30 lower_bound: 27 upper_bound: 29 repeats: 0`
+
+	stringCol := &Column{}
+	stringCol.Histogram = *NewHistogram(2, 15, 30, 0, types.NewFieldType(mysql.TypeString), chunk.InitialCapacity, 0)
+	stringCol.Bounds.AppendString(0, "a")
+	stringCol.Bounds.AppendString(0, "aaaabbbb")
+	stringCol.Buckets = append(stringCol.Buckets, Bucket{Repeat: 10, Count: 60})
+	stringCol.Bounds.AppendString(0, "bbbb")
+	stringCol.Bounds.AppendString(0, "fdsfdsfds")
+	stringCol.Buckets = append(stringCol.Buckets, Bucket{Repeat: 10, Count: 120})
+	stringCol.Bounds.AppendString(0, "kkkkk")
+	stringCol.Bounds.AppendString(0, "ooooo")
+	stringCol.Buckets = append(stringCol.Buckets, Bucket{Repeat: 10, Count: 180})
+	stringCol.Bounds.AppendString(0, "oooooo")
+	stringCol.Bounds.AppendString(0, "sssss")
+	stringCol.Buckets = append(stringCol.Buckets, Bucket{Repeat: 10, Count: 240})
+	stringCol.Bounds.AppendString(0, "ssssssu")
+	stringCol.Bounds.AppendString(0, "yyyyy")
+	stringCol.Buckets = append(stringCol.Buckets, Bucket{Repeat: 10, Count: 300})
+	stringCol.PreCalculateScalar()
+	coll.Columns[2] = stringCol
+	node2 := &StatsNode{ID: 2, Tp: colType, Selectivity: 0.6}
+	node2.Ranges = append(node2.Ranges, &ranger.Range{LowVal: types.MakeDatums(nil), HighVal: types.MakeDatums(nil)})
+	node2.Ranges = append(node2.Ranges, &ranger.Range{LowVal: []types.Datum{types.MinNotNullDatum()}, HighVal: types.MakeDatums("aaa")})
+	node2.Ranges = append(node2.Ranges, &ranger.Range{LowVal: types.MakeDatums("aaaaaaaaaaa"), HighVal: types.MakeDatums("aaaaaaaaaaaaaa")})
+	node2.Ranges = append(node2.Ranges, &ranger.Range{LowVal: types.MakeDatums("bbb"), HighVal: types.MakeDatums("cccc")})
+	node2.Ranges = append(node2.Ranges, &ranger.Range{LowVal: types.MakeDatums("ddd"), HighVal: types.MakeDatums("fff")})
+	node2.Ranges = append(node2.Ranges, &ranger.Range{LowVal: types.MakeDatums("ggg"), HighVal: []types.Datum{types.MaxValueDatum()}})
+	stringColResult := `column:2 ndv:9 totColSize:0
+num: 60 lower_bound: a upper_bound: aaaabbbb repeats: 0
+num: 60 lower_bound: bbbb upper_bound: fdsfdsfds repeats: 20
+num: 60 lower_bound: kkkkk upper_bound: ooooo repeats: 20
+num: 60 lower_bound: oooooo upper_bound: sssss repeats: 20
+num: 60 lower_bound: ssssssu upper_bound: yyyyy repeats: 0`
+
+	newColl := coll.NewHistCollBySelectivity(sc, []*StatsNode{node, node2})
+	c.Assert(newColl.Columns[1].String(), Equals, intColResult)
+	c.Assert(newColl.Columns[2].String(), Equals, stringColResult)
+
+	idx := &Index{Info: &model.IndexInfo{Columns: []*model.IndexColumn{{Name: model.NewCIStr("a"), Offset: 0}}}}
+	idx.Histogram = *NewHistogram(0, 15, 0, 0, types.NewFieldType(mysql.TypeBlob), 0, 0)
+	for i := 0; i < 5; i++ {
+		low, err1 := codec.EncodeKey(sc, nil, types.NewIntDatum(int64(i*3)))
+		c.Assert(err1, IsNil, Commentf("Test failed: %v", err1))
+		high, err2 := codec.EncodeKey(sc, nil, types.NewIntDatum(int64(i*3+2)))
+		c.Assert(err2, IsNil, Commentf("Test failed: %v", err2))
+		idx.Bounds.AppendBytes(0, low)
+		idx.Bounds.AppendBytes(0, high)
+		idx.Buckets = append(idx.Buckets, Bucket{Repeat: 10, Count: int64(30*i + 30)})
+	}
+	idx.PreCalculateScalar()
+	node3 := &StatsNode{ID: 0, Tp: indexType, Selectivity: 0.47}
+	node3.Ranges = append(node3.Ranges, &ranger.Range{LowVal: types.MakeDatums(2), HighVal: types.MakeDatums(3)})
+	node3.Ranges = append(node3.Ranges, &ranger.Range{LowVal: types.MakeDatums(10), HighVal: types.MakeDatums(13)})
+
+	idxResult := `index:0 ndv:7
+num: 30 lower_bound: 0 upper_bound: 2 repeats: 10
+num: 30 lower_bound: 3 upper_bound: 5 repeats: 10
+num: 30 lower_bound: 9 upper_bound: 11 repeats: 10
+num: 30 lower_bound: 12 upper_bound: 14 repeats: 10`
+
+	newIdx, err := idx.newIndexBySelectivity(sc, node3)
+	c.Assert(err, IsNil, Commentf("Test failed: %v", err))
+	c.Assert(newIdx.String(), Equals, idxResult)
+}
diff --git a/statistics/selectivity.go b/statistics/selectivity.go
index 239c884b793bb..dad09f5adc2d4 100644
--- a/statistics/selectivity.go
+++ b/statistics/selectivity.go
@@ -27,19 +27,21 @@ import (
 // If one condition can't be calculated, we will assume that the selectivity of this condition is 0.8.
 const selectionFactor = 0.8

-// exprSet is used for calculating selectivity.
-type exprSet struct {
-	tp int
+// StatsNode is used for calculating selectivity.
+type StatsNode struct {
+	Tp int
 	ID int64
 	// mask is a bit pattern whose ith bit will indicate whether the ith expression is covered by this index/column.
 	mask int64
-	// ranges contains all the ranges we got.
-	ranges []*ranger.Range
+	// Ranges contains all the ranges we got.
+	Ranges []*ranger.Range
+	// Selectivity indicates the selectivity of this column/index.
+	Selectivity float64
 	// numCols is the number of columns contained in the index or column(which is always 1).
 	numCols int
 }

-// The type of the exprSet.
+// The type of the StatsNode.
 const (
 	indexType = iota
 	pkType
@@ -142,18 +144,18 @@ func isColEqCorCol(filter expression.Expression) *expression.Column {
 // And exprs must be CNF now, in other words, `exprs[0] and exprs[1] and ... and exprs[len - 1]` should be held when you call this.
 // TODO: support expressions that the top layer is a DNF.
 // Currently the time complexity is o(n^2).
-func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Expression) (float64, error) {
+func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Expression) (float64, []*StatsNode, error) {
 	// If table's count is zero or conditions are empty, we should return 100% selectivity.
 	if coll.Count == 0 || len(exprs) == 0 {
-		return 1, nil
+		return 1, nil, nil
 	}
 	// TODO: If len(exprs) is bigger than 63, we could use bitset structure to replace the int64.
 	// This will simplify some code and speed up if we use this rather than a boolean slice.
 	if len(exprs) > 63 || (len(coll.Columns) == 0 && len(coll.Indices) == 0) {
-		return pseudoSelectivity(coll, exprs), nil
+		return pseudoSelectivity(coll, exprs), nil, nil
 	}
 	ret := 1.0
-	var sets []*exprSet
+	var nodes []*StatsNode
 	sc := ctx.GetSessionVars().StmtCtx

 	remainedExprs := make([]expression.Expression, 0, len(exprs))
@@ -186,12 +188,24 @@ func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Exp
 		if col != nil {
 			maskCovered, ranges, err := getMaskAndRanges(ctx, remainedExprs, ranger.ColumnRangeType, nil, col)
 			if err != nil {
-				return 0, errors.Trace(err)
+				return 0, nil, errors.Trace(err)
 			}
-			sets = append(sets, &exprSet{tp: colType, ID: id, mask: maskCovered, ranges: ranges, numCols: 1})
+			nodes = append(nodes, &StatsNode{Tp: colType, ID: id, mask: maskCovered, Ranges: ranges, numCols: 1})
 			if colInfo.isHandle {
-				sets[len(sets)-1].tp = pkType
+				nodes[len(nodes)-1].Tp = pkType
+				var cnt float64
+				cnt, err = coll.GetRowCountByIntColumnRanges(sc, id, ranges)
+				if err != nil {
+					return 0, nil, errors.Trace(err)
+				}
+				nodes[len(nodes)-1].Selectivity = cnt / float64(coll.Count)
+				continue
+			}
+			cnt, err := coll.GetRowCountByColumnRanges(sc, id, ranges)
+			if err != nil {
+				return 0, nil, errors.Trace(err)
 			}
+			nodes[len(nodes)-1].Selectivity = cnt / float64(coll.Count)
 		}
 	}
 	for id, idxInfo := range coll.Indices {
@@ -203,38 +217,35 @@ func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Exp
 			}
 			maskCovered, ranges, err := getMaskAndRanges(ctx, remainedExprs, ranger.IndexRangeType, lengths, idxCols...)
 			if err != nil {
-				return 0, errors.Trace(err)
+				return 0, nil, errors.Trace(err)
+			}
+			cnt, err := coll.GetRowCountByIndexRanges(sc, id, ranges)
+			if err != nil {
+				return 0, nil, errors.Trace(err)
 			}
-			sets = append(sets, &exprSet{tp: indexType, ID: id, mask: maskCovered, ranges: ranges, numCols: len(idxInfo.Info.Columns)})
+			selectivity := cnt / float64(coll.Count)
+			nodes = append(nodes, &StatsNode{
+				Tp:          indexType,
+				ID:          id,
+				mask:        maskCovered,
+				Ranges:      ranges,
+				numCols:     len(idxInfo.Info.Columns),
+				Selectivity: selectivity,
+			})
 		}
 	}
-	sets = getUsableSetsByGreedy(sets)
+	usedSets := getUsableSetsByGreedy(nodes)
 	// Initialize the mask with the full set.
 	mask := (int64(1) << uint(len(remainedExprs))) - 1
-	for _, set := range sets {
-		mask ^= set.mask
-		var (
-			rowCount float64
-			err      error
-		)
-		switch set.tp {
-		case pkType:
-			rowCount, err = coll.GetRowCountByIntColumnRanges(sc, set.ID, set.ranges)
-		case colType:
-			rowCount, err = coll.GetRowCountByColumnRanges(sc, set.ID, set.ranges)
-		case indexType:
-			rowCount, err = coll.GetRowCountByIndexRanges(sc, set.ID, set.ranges)
-		}
-		if err != nil {
-			return 0, errors.Trace(err)
-		}
-		ret *= rowCount / float64(coll.Count)
+	for _, set := range usedSets {
+		mask &^= set.mask
+		ret *= set.Selectivity
 	}
 	// If there's still conditions which cannot be calculated, we will multiply a selectionFactor.
 	if mask > 0 {
 		ret *= selectionFactor
 	}
-	return ret, nil
+	return ret, nodes, nil
 }

 func getMaskAndRanges(ctx sessionctx.Context, exprs []expression.Expression, rangeType ranger.RangeType,
@@ -265,14 +276,18 @@
 }

 // getUsableSetsByGreedy will select the indices and pk used for calculate selectivity by greedy algorithm.
-func getUsableSetsByGreedy(sets []*exprSet) (newBlocks []*exprSet) {
+func getUsableSetsByGreedy(nodes []*StatsNode) (newBlocks []*StatsNode) {
+	marked := make([]bool, len(nodes))
 	mask := int64(math.MaxInt64)
 	for {
 		// Choose the index that covers most.
-		bestID, bestCount, bestTp, bestNumCols := -1, 0, colType, 0
-		for i, set := range sets {
-			set.mask &= mask
-			bits := popCount(set.mask)
+		bestID, bestCount, bestTp, bestNumCols, bestMask := -1, 0, colType, 0, int64(0)
+		for i, set := range nodes {
+			if marked[i] {
+				continue
+			}
+			curMask := set.mask & mask
+			bits := popCount(curMask)
 			// This set cannot cover any thing, just skip it.
 			if bits == 0 {
 				continue
 			}
 			// We greedy select the stats info based on:
 			// (1): The stats type, always prefer the primary key or index.
 			// (2): The number of expression that it covers, the more the better.
 			// (3): The number of columns that it contains, the less the better.
-			if (bestTp == colType && set.tp != colType) || bestCount < bits || (bestCount == bits && bestNumCols > set.numCols) {
-				bestID, bestCount, bestTp, bestNumCols = i, bits, set.tp, set.numCols
+			if (bestTp == colType && set.Tp != colType) || bestCount < bits || (bestCount == bits && bestNumCols > set.numCols) {
+				bestID, bestCount, bestTp, bestNumCols, bestMask = i, bits, set.Tp, set.numCols, curMask
 			}
 		}
 		if bestCount == 0 {
 			break
 		}
-		// update the mask, remove the bit that sets[bestID].mask has.
-		mask &^= sets[bestID].mask
+		// update the mask, remove the bit that nodes[bestID].mask has.
+		mask &^= bestMask

-		newBlocks = append(newBlocks, sets[bestID])
-		// remove the chosen one
-		sets = append(sets[:bestID], sets[bestID+1:]...)
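+		// Mark the chosen node instead of splicing it out of the slice: `nodes` is
+		// also returned to the caller by Selectivity, so it must stay intact.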
+		newBlocks = append(newBlocks, nodes[bestID])
+		marked[bestID] = true
 	}
 	return
 }
diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go
index faa45cb11b706..7956dff6741b5 100644
--- a/statistics/selectivity_test.go
+++ b/statistics/selectivity_test.go
@@ -179,12 +179,12 @@ func (s *testStatsSuite) TestSelectivity(c *C) {

 		histColl := statsTbl.GenerateHistCollFromColumnInfo(ds.Columns, ds.Schema().Columns)

-		ratio, err := histColl.Selectivity(ctx, sel.Conditions)
+		ratio, _, err := histColl.Selectivity(ctx, sel.Conditions)
 		c.Assert(err, IsNil, comment)
 		c.Assert(math.Abs(ratio-tt.selectivity) < eps, IsTrue, Commentf("for %s, needed: %v, got: %v", tt.exprs, tt.selectivity, ratio))

 		histColl.Count *= 10
-		ratio, err = histColl.Selectivity(ctx, sel.Conditions)
+		ratio, _, err = histColl.Selectivity(ctx, sel.Conditions)
 		c.Assert(err, IsNil, comment)
 		c.Assert(math.Abs(ratio-tt.selectivity) < eps, IsTrue, Commentf("for %s, needed: %v, got: %v", tt.exprs, tt.selectivity, ratio))
 	}
@@ -350,10 +350,10 @@ func BenchmarkSelectivity(b *testing.B) {
 	defer file.Close()
 	pprof.StartCPUProfile(file)

-	b.Run("selectivity", func(b *testing.B) {
+	b.Run("Selectivity", func(b *testing.B) {
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
-			_, err := statsTbl.Selectivity(ctx, p.(plannercore.LogicalPlan).Children()[0].(*plannercore.LogicalSelection).Conditions)
+			_, _, err := statsTbl.Selectivity(ctx, p.(plannercore.LogicalPlan).Children()[0].(*plannercore.LogicalSelection).Conditions)
 			c.Assert(err, IsNil)
 		}
 		b.ReportAllocs()
diff --git a/statistics/table.go b/statistics/table.go
index 4d65063143ddc..b46ab432d2b28 100644
--- a/statistics/table.go
+++ b/statistics/table.go
@@ -466,7 +466,7 @@ func getOrdinalOfRangeCond(sc *stmtctx.StatementContext, ran *ranger.Range) int
 }

 // GenerateHistCollFromColumnInfo generates a new HistColl whose ColID2IdxID and IdxID2ColIDs is built from the given parameter.
-func (coll *HistColl) GenerateHistCollFromColumnInfo(infos []*model.ColumnInfo, columns []*expression.Column) HistColl {
+func (coll *HistColl) GenerateHistCollFromColumnInfo(infos []*model.ColumnInfo, columns []*expression.Column) *HistColl {
 	newColHistMap := make(map[int64]*Column)
 	colInfoID2UniqueID := make(map[int64]int64)
 	colNames2UniqueID := make(map[string]int64)
@@ -506,7 +506,7 @@ func (coll *HistColl) GenerateHistCollFromColumnInfo(infos []*model.ColumnInfo,
 		newIdxHistMap[idxHist.ID] = idxHist
 		idx2Columns[idxHist.ID] = ids
 	}
-	newColl := HistColl{
+	newColl := &HistColl{
 		PhysicalID:     coll.PhysicalID,
 		HavePhysicalID: coll.HavePhysicalID,
 		Pseudo:         coll.Pseudo,
diff --git a/statistics/update.go b/statistics/update.go
index b65c4cdbb7734..bc1e03e739830 100644
--- a/statistics/update.go
+++ b/statistics/update.go
@@ -562,7 +562,7 @@ func (h *Handle) handleSingleHistogramUpdate(is infoschema.InfoSchema, rows []ch
 	}
 	q := &QueryFeedback{}
 	for _, row := range rows {
-		err1 := decodeFeedback(row.GetBytes(3), q, cms, mysql.HasUnsignedFlag(hist.tp.Flag))
+		err1 := decodeFeedback(row.GetBytes(3), q, cms, mysql.HasUnsignedFlag(hist.Tp.Flag))
 		if err1 != nil {
 			log.Debugf("decode feedback failed, err: %v", errors.ErrorStack(err))
 		}
diff --git a/statistics/update_test.go b/statistics/update_test.go
index 4e4d18a44faeb..0684d341e4a40 100644
--- a/statistics/update_test.go
+++ b/statistics/update_test.go
@@ -701,7 +701,7 @@ func (s *testStatsSuite) TestSplitRange(c *C) {
 			HighExclude: t.exclude[i+1],
 		})
 	}
-	ranges = h.SplitRange(ranges)
+	ranges = h.SplitRange(nil, ranges, false)
 	var ranStrs []string
 	for _, ran := range ranges {
 		ranStrs = append(ranStrs, ran.String())
@@ -782,7 +782,7 @@ func (s *testStatsSuite) TestQueryFeedback(c *C) {
 	feedback := h.GetQueryFeedback()
 	c.Assert(len(feedback), Equals, 0)

-	// Test only collect for max number of ranges.
+	// Test that we only collect feedback when the number of ranges does not exceed MaxNumberOfRanges.
 	statistics.MaxNumberOfRanges = 0
 	for _, t := range tests {
 		testKit.MustQuery(t.sql)
diff --git a/types/datum.go b/types/datum.go
index ed340e00aaf33..b8bea0dc25452 100644
--- a/types/datum.go
+++ b/types/datum.go
@@ -218,6 +218,12 @@ func (d *Datum) SetNull() {
 	d.x = nil
 }

+// SetMinNotNull sets the datum to the minimum non-null value (KindMinNotNull).
+func (d *Datum) SetMinNotNull() {
+	d.k = KindMinNotNull
+	d.x = nil
+}
+
 // GetBinaryLiteral gets Bit value
 func (d *Datum) GetBinaryLiteral() BinaryLiteral {
 	return d.b
diff --git a/util/ranger/types.go b/util/ranger/types.go
index 5db46355353db..ceed9531b2e36 100644
--- a/util/ranger/types.go
+++ b/util/ranger/types.go
@@ -19,8 +19,10 @@ import (
 	"strings"

 	"github.com/pingcap/errors"
+	"github.com/pingcap/tidb/kv"
 	"github.com/pingcap/tidb/sessionctx/stmtctx"
 	"github.com/pingcap/tidb/types"
+	"github.com/pingcap/tidb/util/codec"
 )

 // Range represents a range generated in physical plan building phase.
@@ -95,6 +97,26 @@ func (ran *Range) String() string {
 	return l + strings.Join(lowStrs, " ") + "," + strings.Join(highStrs, " ") + r
 }

+// Encode encodes the range, returning the encoded low and high key bounds.
+func (ran *Range) Encode(sc *stmtctx.StatementContext, lowBuffer, highBuffer []byte) ([]byte, []byte, error) {
+	var err error
+	lowBuffer, err = codec.EncodeKey(sc, lowBuffer[:0], ran.LowVal...)
+	if err != nil {
+		return nil, nil, err
+	}
+	if ran.LowExclude {
+		lowBuffer = kv.Key(lowBuffer).PrefixNext()
+	}
+	highBuffer, err = codec.EncodeKey(sc, highBuffer[:0], ran.HighVal...)
+	if err != nil {
+		return nil, nil, err
+	}
+	if !ran.HighExclude {
+		highBuffer = kv.Key(highBuffer).PrefixNext()
+	}
+	return lowBuffer, highBuffer, nil
+}
+
 // PrefixEqualLen tells you how long the prefix of the range is a point.
 // e.g. If this range is (1 2 3, 1 2 +inf), then the return value is 2.
 func (ran *Range) PrefixEqualLen(sc *stmtctx.StatementContext) (int, error) {
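A usage sketch for the new Range.Encode, not part of the patch; variable names are illustrative. The buffers are reset via `[:0]` and may be reused across calls, as newIndexBySelectivity above does, and exclusivity is folded into the keys with PrefixNext, so the encoded pair always bounds a half-open [low, high) key interval:

	var lowBuf, highBuf []byte
	for _, ran := range ranges {
		lowBuf, highBuf, err = ran.Encode(sc, lowBuf, highBuf)
		if err != nil {
			return err
		}
		// lowBuf/highBuf now delimit the encoded key interval covered by ran.
	}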