Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

stats: fix estimation for out of range point queries #8015

Merged
merged 3 commits into from
Oct 24, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions cmd/explaintest/r/explain_complex_stats.result
Original file line number Diff line number Diff line change
Expand Up @@ -158,11 +158,11 @@ Projection_5 39.28 root test.st.cm, test.st.p1, test.st.p2, test.st.p3, test.st.
└─TableScan_14 160.23 cop table:st, keep order:false
explain select dt.id as id, dt.aid as aid, dt.pt as pt, dt.dic as dic, dt.cm as cm, rr.gid as gid, rr.acd as acd, rr.t as t,dt.p1 as p1, dt.p2 as p2, dt.p3 as p3, dt.p4 as p4, dt.p5 as p5, dt.p6_md5 as p6, dt.p7_md5 as p7 from dt dt join rr rr on (rr.pt = 'ios' and rr.t > 1478185592 and dt.aid = rr.aid and dt.dic = rr.dic) where dt.pt = 'ios' and dt.t > 1478185592 and dt.bm = 0 limit 2000;
id count task operator info
Projection_9 428.55 root dt.id, dt.aid, dt.pt, dt.dic, dt.cm, rr.gid, rr.acd, rr.t, dt.p1, dt.p2, dt.p3, dt.p4, dt.p5, dt.p6_md5, dt.p7_md5
└─Limit_12 428.55 root offset:0, count:2000
└─IndexJoin_18 428.55 root inner join, inner:IndexLookUp_17, outer key:dt.aid, dt.dic, inner key:rr.aid, rr.dic
├─TableReader_42 428.55 root data:Selection_41
│ └─Selection_41 428.55 cop eq(dt.bm, 0), eq(dt.pt, "ios"), gt(dt.t, 1478185592)
Projection_9 428.32 root dt.id, dt.aid, dt.pt, dt.dic, dt.cm, rr.gid, rr.acd, rr.t, dt.p1, dt.p2, dt.p3, dt.p4, dt.p5, dt.p6_md5, dt.p7_md5
└─Limit_12 428.32 root offset:0, count:2000
└─IndexJoin_18 428.32 root inner join, inner:IndexLookUp_17, outer key:dt.aid, dt.dic, inner key:rr.aid, rr.dic
├─TableReader_42 428.32 root data:Selection_41
│ └─Selection_41 428.32 cop eq(dt.bm, 0), eq(dt.pt, "ios"), gt(dt.t, 1478185592)
│ └─TableScan_40 2000.00 cop table:dt, range:[0,+inf], keep order:false
└─IndexLookUp_17 970.00 root
├─IndexScan_14 1.00 cop table:rr, index:aid, dic, range: decided by [dt.aid dt.dic], keep order:false
Expand Down
6 changes: 3 additions & 3 deletions cmd/explaintest/r/explain_easy_stats.result
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,10 @@ explain select * from t1 left join t2 on t1.c2 = t2.c1 where t1.c1 > 1;
id count task operator info
Projection_6 2481.25 root test.t1.c1, test.t1.c2, test.t1.c3, test.t2.c1, test.t2.c2
└─MergeJoin_7 2481.25 root left outer join, left key:test.t1.c2, right key:test.t2.c1
├─IndexLookUp_17 1999.00 root
│ ├─Selection_16 1999.00 cop gt(test.t1.c1, 1)
├─IndexLookUp_17 1998.00 root
│ ├─Selection_16 1998.00 cop gt(test.t1.c1, 1)
│ │ └─IndexScan_14 1999.00 cop table:t1, index:c2, range:[NULL,+inf], keep order:true
│ └─TableScan_15 1999.00 cop table:t1, keep order:false
│ └─TableScan_15 1998.00 cop table:t1, keep order:false
└─IndexLookUp_21 1985.00 root
├─IndexScan_19 1985.00 cop table:t2, index:c1, range:[NULL,+inf], keep order:true
└─TableScan_20 1985.00 cop table:t2, keep order:false
Expand Down
20 changes: 10 additions & 10 deletions cmd/explaintest/r/tpch.result
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ limit 10;
id count task operator info
Projection_14 10.00 root tpch.lineitem.l_orderkey, 7_col_0, tpch.orders.o_orderdate, tpch.orders.o_shippriority
└─TopN_17 10.00 root 7_col_0:desc, tpch.orders.o_orderdate:asc, offset:0, count:10
└─HashAgg_20 40256361.71 root group by:tpch.lineitem.l_orderkey, tpch.orders.o_orderdate, tpch.orders.o_shippriority, funcs:sum(mul(tpch.lineitem.l_extendedprice, minus(1, tpch.lineitem.l_discount))), firstrow(tpch.orders.o_orderdate), firstrow(tpch.orders.o_shippriority), firstrow(tpch.lineitem.l_orderkey)
└─HashAgg_20 40227041.09 root group by:tpch.lineitem.l_orderkey, tpch.orders.o_orderdate, tpch.orders.o_shippriority, funcs:sum(mul(tpch.lineitem.l_extendedprice, minus(1, tpch.lineitem.l_discount))), firstrow(tpch.orders.o_orderdate), firstrow(tpch.orders.o_shippriority), firstrow(tpch.lineitem.l_orderkey)
└─IndexJoin_26 91515927.49 root inner join, inner:IndexLookUp_25, outer key:tpch.orders.o_orderkey, inner key:tpch.lineitem.l_orderkey
├─HashRightJoin_46 22592975.51 root inner join, inner:TableReader_52, equal:[eq(tpch.customer.c_custkey, tpch.orders.o_custkey)]
│ ├─TableReader_52 1498236.00 root data:Selection_51
Expand All @@ -260,9 +260,9 @@ Projection_14 10.00 root tpch.lineitem.l_orderkey, 7_col_0, tpch.orders.o_orderd
│ └─TableReader_49 36870000.00 root data:Selection_48
│ └─Selection_48 36870000.00 cop lt(tpch.orders.o_orderdate, 1995-03-13 00:00:00.000000)
│ └─TableScan_47 75000000.00 cop table:orders, range:[-inf,+inf], keep order:false
└─IndexLookUp_25 163063881.42 root
└─IndexLookUp_25 162945114.27 root
├─IndexScan_22 1.00 cop table:lineitem, index:L_ORDERKEY, L_LINENUMBER, range: decided by [tpch.orders.o_orderkey], keep order:false
└─Selection_24 163063881.42 cop gt(tpch.lineitem.l_shipdate, 1995-03-13 00:00:00.000000)
└─Selection_24 162945114.27 cop gt(tpch.lineitem.l_shipdate, 1995-03-13 00:00:00.000000)
└─TableScan_23 1.00 cop table:lineitem, keep order:false
/*
Q4 Order Priority Checking Query
Expand Down Expand Up @@ -922,13 +922,13 @@ p_brand,
p_type,
p_size;
id count task operator info
Sort_13 15.00 root supplier_cnt:desc, tpch.part.p_brand:asc, tpch.part.p_type:asc, tpch.part.p_size:asc
└─Projection_14 15.00 root tpch.part.p_brand, tpch.part.p_type, tpch.part.p_size, 9_col_0
└─HashAgg_17 15.00 root group by:tpch.part.p_brand, tpch.part.p_size, tpch.part.p_type, funcs:count(distinct tpch.partsupp.ps_suppkey), firstrow(tpch.part.p_brand), firstrow(tpch.part.p_type), firstrow(tpch.part.p_size)
└─HashLeftJoin_22 4022816.68 root anti semi join, inner:TableReader_46, equal:[eq(tpch.partsupp.ps_suppkey, tpch.supplier.s_suppkey)]
├─IndexJoin_26 5028520.85 root inner join, inner:IndexReader_25, outer key:tpch.part.p_partkey, inner key:tpch.partsupp.ps_partkey
│ ├─TableReader_41 1249969.60 root data:Selection_40
│ │ └─Selection_40 1249969.60 cop in(tpch.part.p_size, 48, 19, 12, 4, 41, 7, 21, 39), ne(tpch.part.p_brand, "Brand#34"), not(like(tpch.part.p_type, "LARGE BRUSHED%", 92))
Sort_13 14.41 root supplier_cnt:desc, tpch.part.p_brand:asc, tpch.part.p_type:asc, tpch.part.p_size:asc
└─Projection_14 14.41 root tpch.part.p_brand, tpch.part.p_type, tpch.part.p_size, 9_col_0
└─HashAgg_17 14.41 root group by:tpch.part.p_brand, tpch.part.p_size, tpch.part.p_type, funcs:count(distinct tpch.partsupp.ps_suppkey), firstrow(tpch.part.p_brand), firstrow(tpch.part.p_type), firstrow(tpch.part.p_size)
└─HashLeftJoin_22 3863988.24 root anti semi join, inner:TableReader_46, equal:[eq(tpch.partsupp.ps_suppkey, tpch.supplier.s_suppkey)]
├─IndexJoin_26 4829985.30 root inner join, inner:IndexReader_25, outer key:tpch.part.p_partkey, inner key:tpch.partsupp.ps_partkey
│ ├─TableReader_41 1200618.43 root data:Selection_40
│ │ └─Selection_40 1200618.43 cop in(tpch.part.p_size, 48, 19, 12, 4, 41, 7, 21, 39), ne(tpch.part.p_brand, "Brand#34"), not(like(tpch.part.p_type, "LARGE BRUSHED%", 92))
│ │ └─TableScan_39 10000000.00 cop table:part, range:[-inf,+inf], keep order:false
│ └─IndexReader_25 1.00 root index:IndexScan_24
│ └─IndexScan_24 1.00 cop table:partsupp, index:PS_PARTKEY, PS_SUPPKEY, range: decided by [tpch.part.p_partkey], keep order:false
Expand Down
2 changes: 1 addition & 1 deletion statistics/ddl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ func (s *testStatsCacheSuite) TestDDLHistogram(c *C) {
c.Assert(count, Equals, float64(2))
count, err = statsTbl.ColumnEqualRowCount(sc, types.NewIntDatum(1), tableInfo.Columns[3].ID)
c.Assert(err, IsNil)
c.Assert(count, Equals, float64(2))
c.Assert(count, Equals, float64(0))

testKit.MustExec("alter table t add column c4 datetime NOT NULL default CURRENT_TIMESTAMP")
err = h.HandleDDLEvent(<-h.DDLEventCh())
Expand Down
16 changes: 8 additions & 8 deletions statistics/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -729,7 +729,7 @@ func (c *Column) String() string {
return c.Histogram.ToString(0)
}

func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum) (float64, error) {
func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, modifyCount int64) (float64, error) {
if val.IsNull() {
return float64(c.NullCount), nil
}
Expand All @@ -738,7 +738,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum) (f
return 0.0, nil
}
if c.NDV > 0 && c.outOfRange(val) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not related to this PR, but I am wondering why we need the c.NDV > 0 check here when we have already checked c.Histogram.Bounds == nil.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The check is only there to make it explicit that NDV is greater than 0; actually, we can remove it.

return c.totalRowCount() / (float64(c.NDV)), nil
return float64(modifyCount) / float64(c.NDV), nil
zz-jason marked this conversation as resolved.
Show resolved Hide resolved
}
if c.CMSketch != nil {
count, err := c.CMSketch.queryValue(sc, val)
Expand All @@ -759,7 +759,7 @@ func (c *Column) getColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
// the point case.
if !rg.LowExclude && !rg.HighExclude {
var cnt float64
cnt, err = c.equalRowCount(sc, rg.LowVal[0])
cnt, err = c.equalRowCount(sc, rg.LowVal[0], modifyCount)
if err != nil {
return 0, errors.Trace(err)
}
Expand All @@ -773,14 +773,14 @@ func (c *Column) getColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
cnt += float64(modifyCount) / outOfRangeBetweenRate
}
if rg.LowExclude {
lowCnt, err := c.equalRowCount(sc, rg.LowVal[0])
lowCnt, err := c.equalRowCount(sc, rg.LowVal[0], modifyCount)
if err != nil {
return 0, errors.Trace(err)
}
cnt -= lowCnt
}
if !rg.HighExclude {
highCnt, err := c.equalRowCount(sc, rg.HighVal[0])
highCnt, err := c.equalRowCount(sc, rg.HighVal[0], modifyCount)
if err != nil {
return 0, errors.Trace(err)
}
Expand Down Expand Up @@ -809,10 +809,10 @@ func (idx *Index) String() string {
return idx.Histogram.ToString(len(idx.Info.Columns))
}

func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte) float64 {
func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte, modifyCount int64) float64 {
val := types.NewBytesDatum(b)
if idx.NDV > 0 && idx.outOfRange(val) {
return idx.totalRowCount() / (float64(idx.NDV))
return float64(modifyCount) / (float64(idx.NDV))
}
if idx.CMSketch != nil {
return float64(idx.CMSketch.QueryBytes(b))
Expand All @@ -834,7 +834,7 @@ func (idx *Index) getRowCount(sc *stmtctx.StatementContext, indexRanges []*range
fullLen := len(indexRange.LowVal) == len(indexRange.HighVal) && len(indexRange.LowVal) == len(idx.Info.Columns)
if fullLen && bytes.Equal(lb, rb) {
if !indexRange.LowExclude && !indexRange.HighExclude {
totalCount += idx.equalRowCount(sc, lb)
totalCount += idx.equalRowCount(sc, lb, modifyCount)
}
continue
}
Expand Down
22 changes: 20 additions & 2 deletions statistics/selectivity_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ func (s *testSelectivitySuite) TestSelectivity(c *C) {
},
{
exprs: "a >= 1 and b > 1 and a < 2",
selectivity: 0.01817558299,
selectivity: 0.01783264746,
},
{
exprs: "a >= 1 and c > 1 and a < 2",
Expand All @@ -174,7 +174,7 @@ func (s *testSelectivitySuite) TestSelectivity(c *C) {
},
{
exprs: "b > 1",
selectivity: 0.98148148148,
selectivity: 0.96296296296,
},
{
exprs: "a > 1 and b < 2 and c > 3 and d < 4 and e > 5",
Expand Down Expand Up @@ -304,6 +304,24 @@ func (s *testSelectivitySuite) TestEstimationForUnknownValues(c *C) {
count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(1, 30))
c.Assert(err, IsNil)
c.Assert(count, Equals, 0.0)

testKit.MustExec("drop table t")
testKit.MustExec("create table t(a int, b int, index idx(b))")
testKit.MustExec("insert into t values (1,1)")
testKit.MustExec("analyze table t")
table, err = s.dom.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
c.Assert(err, IsNil)
statsTbl = h.GetTableStats(table.Meta())

colID = table.Meta().Columns[0].ID
count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(2, 2))
c.Assert(err, IsNil)
c.Assert(count, Equals, 0.0)

idxID = table.Meta().Indices[0].ID
count, err = statsTbl.GetRowCountByIndexRanges(sc, idxID, getRange(2, 2))
c.Assert(err, IsNil)
c.Assert(count, Equals, 0.0)
}

func BenchmarkSelectivity(b *testing.B) {
Expand Down
4 changes: 2 additions & 2 deletions statistics/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,7 @@ func (t *Table) ColumnEqualRowCount(sc *stmtctx.StatementContext, value types.Da
return float64(t.Count) / pseudoEqualRate, nil
}
c := t.Columns[colID]
result, err := c.equalRowCount(sc, value)
result, err := c.equalRowCount(sc, value, t.ModifyCount)
result *= c.getIncreaseFactor(t.Count)
return result, errors.Trace(err)
}
Expand Down Expand Up @@ -551,7 +551,7 @@ func (coll *HistColl) getIndexRowCount(sc *stmtctx.StatementContext, idxID int64
// so we use heuristic methods to estimate the selectivity.
if idx.NDV > 0 && len(ran.LowVal) == len(idx.Info.Columns) && rangePosition == len(ran.LowVal) {
// for equality queries
selectivity = 1.0 / float64(idx.NDV)
selectivity = float64(coll.ModifyCount) / float64(idx.NDV) / idx.totalRowCount()
} else {
// for range queries
selectivity = float64(coll.ModifyCount) / outOfRangeBetweenRate / idx.totalRowCount()
Expand Down