Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

statistics: fix estimation error when ranges are too many and modify count is large #40472

Merged
merged 11 commits into from
Jan 11, 2023
1 change: 1 addition & 0 deletions statistics/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ go_test(
"@com_github_pingcap_failpoint//:failpoint",
"@com_github_pingcap_log//:log",
"@com_github_stretchr_testify//require",
"@org_golang_x_exp//slices",
"@org_uber_go_goleak//:goleak",
"@org_uber_go_zap//:zap",
],
Expand Down
10 changes: 6 additions & 4 deletions statistics/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ func (idx *Index) GetRowCount(sctx sessionctx.Context, coll *HistColl, indexRang
totalCount := float64(0)
isSingleCol := len(idx.Info.Columns) == 1
for _, indexRange := range indexRanges {
var count float64
lb, err := codec.EncodeKey(sc, nil, indexRange.LowVal...)
if err != nil {
return 0, err
Expand All @@ -242,7 +243,7 @@ func (idx *Index) GetRowCount(sctx sessionctx.Context, coll *HistColl, indexRang
totalCount++
continue
}
count := idx.equalRowCount(lb, realtimeRowCount)
count = idx.equalRowCount(lb, realtimeRowCount)
// If the current table row count has changed, we should scale the row count accordingly.
count *= idx.GetIncreaseFactor(realtimeRowCount)
totalCount += count
Expand All @@ -262,7 +263,7 @@ func (idx *Index) GetRowCount(sctx sessionctx.Context, coll *HistColl, indexRang
r := types.NewBytesDatum(rb)
lowIsNull := bytes.Equal(lb, nullKeyBytes)
if isSingleCol && lowIsNull {
totalCount += float64(idx.Histogram.NullCount)
count += float64(idx.Histogram.NullCount)
}
expBackoffSuccess := false
// Due to the limitation of calcFraction and convertDatumToScalar, the histogram actually won't estimate anything.
Expand Down Expand Up @@ -301,16 +302,17 @@ func (idx *Index) GetRowCount(sctx sessionctx.Context, coll *HistColl, indexRang
}
}
if !expBackoffSuccess {
totalCount += idx.BetweenRowCount(l, r)
count += idx.BetweenRowCount(l, r)
}

// If the current table row count has changed, we should scale the row count accordingly.
totalCount *= idx.GetIncreaseFactor(realtimeRowCount)
count *= idx.GetIncreaseFactor(realtimeRowCount)

// handling the out-of-range part
if (idx.outOfRange(l) && !(isSingleCol && lowIsNull)) || idx.outOfRange(r) {
totalCount += idx.Histogram.outOfRangeRowCount(&l, &r, modifyCount)
}
totalCount += count
}
totalCount = mathutil.Clamp(totalCount, 0, float64(realtimeRowCount))
return totalCount, nil
Expand Down
84 changes: 83 additions & 1 deletion statistics/selectivity_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ import (
"github.com/pingcap/tidb/util/mock"
"github.com/pingcap/tidb/util/ranger"
"github.com/stretchr/testify/require"
"golang.org/x/exp/slices"
)

func TestCollationColumnEstimate(t *testing.T) {
Expand Down Expand Up @@ -891,7 +892,7 @@ func prepareSelectivity(testKit *testkit.TestKit, dom *domain.Domain) (*statisti
return statsTbl, nil
}

func getRange(start, end int64) []*ranger.Range {
func getRange(start, end int64) ranger.Ranges {
ran := &ranger.Range{
LowVal: []types.Datum{types.NewIntDatum(start)},
HighVal: []types.Datum{types.NewIntDatum(end)},
Expand All @@ -900,6 +901,21 @@ func getRange(start, end int64) []*ranger.Range {
return []*ranger.Range{ran}
}

// getRanges builds one single-column integer range per index i, using
// start[i] as the low value and end[i] as the high value, each with a
// one-element binary collator slice.
//
// It returns nil when start and end have different lengths, and also when
// both are empty (nothing is ever appended to the named result).
func getRanges(start, end []int64) (res ranger.Ranges) {
	if len(start) != len(end) {
		return nil
	}
	for i, low := range start {
		res = append(res, &ranger.Range{
			LowVal:    []types.Datum{types.NewIntDatum(low)},
			HighVal:   []types.Datum{types.NewIntDatum(end[i])},
			Collators: collate.GetBinaryCollatorSlice(1),
		})
	}
	return res
}

func TestSelectivityGreedyAlgo(t *testing.T) {
nodes := make([]*statistics.StatsNode, 3)
nodes[0] = statistics.MockStatsNode(1, 3, 2)
Expand Down Expand Up @@ -1075,3 +1091,69 @@ func TestGlobalStatsOutOfRangeEstimationAfterDelete(t *testing.T) {
testKit.MustQuery(input[i]).Check(testkit.Rows(output[i].Result...))
}
}

// generateMapsForMockStatsTbl fills in statsTbl.Idx2ColumnIDs and
// statsTbl.ColID2IdxIDs from the indices already present in statsTbl.Indices.
//
// For each index, the Offset of every index column is recorded (as int64) in
// Idx2ColumnIDs under the index ID, and the index ID is appended to
// ColID2IdxIDs under the first of those offsets. Each ColID2IdxIDs entry is
// sorted in ascending order afterwards.
//
// NOTE(review): an index whose Info.Columns is empty makes the first-offset
// lookup panic; the mock tables used by the tests are presumably never built
// that way — confirm before reusing this helper elsewhere.
func generateMapsForMockStatsTbl(statsTbl *statistics.Table) {
	colsByIdx := make(map[int64][]int64)
	idxsByFirstCol := make(map[int64][]int64)
	for _, idx := range statsTbl.Indices {
		offsets := make([]int64, 0, len(idx.Info.Columns))
		for _, col := range idx.Info.Columns {
			offsets = append(offsets, int64(col.Offset))
		}
		first := offsets[0]
		idxsByFirstCol[first] = append(idxsByFirstCol[first], idx.ID)
		colsByIdx[idx.ID] = offsets
	}
	// Sort in place: each slice shares its backing array with the map value.
	for _, ids := range idxsByFirstCol {
		slices.Sort(ids)
	}
	statsTbl.ColID2IdxIDs = idxsByFirstCol
	statsTbl.Idx2ColumnIDs = colsByIdx
}

// TestIssue39593 estimates the row count of twenty point ranges (1..20) on
// index idx(a, b) against a mocked statistics.Table, and asserts that the
// estimate is 360 before, and 3600 after, statsTbl.Count is multiplied by 10.
func TestIssue39593(t *testing.T) {
	store, dom := testkit.CreateMockStoreAndDomain(t)
	tk := testkit.NewTestKit(t, store)

	for _, stmt := range []string{
		"use test",
		"drop table if exists t",
		"create table t(a int, b int, index idx(a, b))",
	} {
		tk.MustExec(stmt)
	}
	tbl, err := dom.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
	require.NoError(t, err)
	tblInfo := tbl.Meta()

	// Mock the statistics.Table: both columns get the same mocked histogram
	// values, and the index gets its own mocked histogram.
	statsTbl := mockStatsTable(tblInfo, 540)
	colValues, err := generateIntDatum(1, 54)
	require.NoError(t, err)
	for colID := int64(1); colID <= 2; colID++ {
		statsTbl.Columns[colID] = &statistics.Column{
			Histogram:         *mockStatsHistogram(colID, colValues, 10, types.NewFieldType(mysql.TypeLonglong)),
			Info:              tblInfo.Columns[colID-1],
			StatsLoadedStatus: statistics.NewStatsFullLoadStatus(),
			StatsVer:          2,
		}
	}
	idxValues, err := generateIntDatum(2, 3)
	require.NoError(t, err)
	blobTp := types.NewFieldType(mysql.TypeBlob)
	statsTbl.Indices[1] = &statistics.Index{
		Histogram: *mockStatsHistogram(1, idxValues, 60, blobTp),
		Info:      tblInfo.Indices[0],
		StatsVer:  2,
	}
	generateMapsForMockStatsTbl(statsTbl)

	sctx := tk.Session()
	idxID := tblInfo.Indices[0].ID
	// Point ranges [v, v] for v = 1..20: the same slice is used as both the
	// low and the high bounds.
	vals := make([]int64, 0, 20)
	for v := int64(1); v <= 20; v++ {
		vals = append(vals, v)
	}

	// Estimated row count without any changes.
	got, err := statsTbl.GetRowCountByIndexRanges(sctx, idxID, getRanges(vals, vals))
	require.NoError(t, err)
	require.Equal(t, float64(360), got)

	// Estimated row count after mocking modifications on the table: the
	// table count is ten times larger and the estimate must scale with it.
	statsTbl.Count *= 10
	got, err = statsTbl.GetRowCountByIndexRanges(sctx, idxID, getRanges(vals, vals))
	require.NoError(t, err)
	require.Equal(t, float64(3600), got)
}