stats: do not split excluded lower value ranges (#12009) (#12172)

pingcap · Sep 13, 2019 · c3c04c6 · c3c04c6
1 parent 9059790
commit c3c04c6
Show file tree

Hide file tree

Showing 4 changed files with 111 additions and 101 deletions.
diff --git a/statistics/feedback.go b/statistics/feedback.go
@@ -309,15 +309,21 @@ func buildBucketFeedback(h *Histogram, feedback *QueryFeedback) (map[int]*Bucket
 		if skip {
 			continue
 		}
-		idx, _ := h.Bounds.LowerBound(0, fb.lower)
+		idx := h.Bounds.UpperBound(0, fb.lower)
 		bktIdx := 0
 		// The last bucket also stores the feedback that falls outside the upper bound.
-		if idx >= h.Bounds.NumRows()-2 {
+		if idx >= h.Bounds.NumRows()-1 {
 			bktIdx = h.Len() - 1
+		} else if h.Len() == 1 {
+			bktIdx = 0
 		} else {
-			bktIdx = idx / 2
+			if idx == 0 {
+				bktIdx = 0
+			} else {
+				bktIdx = (idx - 1) / 2
+			}
 			// Make sure that this feedback lies within the bucket.
-			if chunk.Compare(h.Bounds.GetRow(2*bktIdx+1), 0, fb.upper) < 0 {
+			if chunk.Compare(h.Bounds.GetRow(2*(bktIdx+1)), 0, fb.upper) < 0 {
 				continue
 			}
 		}
@@ -954,11 +960,11 @@ func formatBuckets(hg *Histogram, lowBkt, highBkt, idxCols int) string {
 		return hg.bucketToString(lowBkt, idxCols)
 	}
 	if lowBkt+1 == highBkt {
-		return fmt.Sprintf("%s, %s", hg.bucketToString(lowBkt, 0), hg.bucketToString(highBkt, 0))
+		return fmt.Sprintf("%s, %s", hg.bucketToString(lowBkt, idxCols), hg.bucketToString(highBkt, idxCols))
 	}
 	// do not care the middle buckets
-	return fmt.Sprintf("%s, (%d buckets, total count %d), %s", hg.bucketToString(lowBkt, 0),
-		highBkt-lowBkt-1, hg.Buckets[highBkt-1].Count-hg.Buckets[lowBkt].Count, hg.bucketToString(highBkt, 0))
+	return fmt.Sprintf("%s, (%d buckets, total count %d), %s", hg.bucketToString(lowBkt, idxCols),
+		highBkt-lowBkt-1, hg.Buckets[highBkt-1].Count-hg.Buckets[lowBkt].Count, hg.bucketToString(highBkt, idxCols))
 }
 
 func colRangeToStr(c *Column, ran *ranger.Range, actual int64, factor float64) string {

diff --git a/statistics/feedback_test.go b/statistics/feedback_test.go
@@ -70,14 +70,13 @@ func (s *testFeedbackSuite) TestUpdateHistogram(c *C) {
 	defaultBucketCount = 7
 	defer func() { defaultBucketCount = originBucketCount }()
 	c.Assert(UpdateHistogram(q.Hist(), q).ToString(0), Equals,
-		"column:0 ndv:10058 totColSize:0\n"+
-			"num: 10000 lower_bound: 0 upper_bound: 1 repeats: 0\n"+
-			"num: 9 lower_bound: 2 upper_bound: 7 repeats: 0\n"+
-			"num: 11 lower_bound: 8 upper_bound: 19 repeats: 0\n"+
-			"num: 0 lower_bound: 20 upper_bound: 20 repeats: 0\n"+
-			"num: 18 lower_bound: 21 upper_bound: 39 repeats: 0\n"+
-			"num: 18 lower_bound: 40 upper_bound: 58 repeats: 0\n"+
-			"num: 2 lower_bound: 59 upper_bound: 60 repeats: 0")
+		"column:0 ndv:10053 totColSize:0\n"+
+			"num: 10001 lower_bound: 0 upper_bound: 2 repeats: 0\n"+
+			"num: 7 lower_bound: 2 upper_bound: 5 repeats: 0\n"+
+			"num: 4 lower_bound: 5 upper_bound: 7 repeats: 0\n"+
+			"num: 11 lower_bound: 10 upper_bound: 20 repeats: 0\n"+
+			"num: 19 lower_bound: 30 upper_bound: 49 repeats: 0\n"+
+			"num: 11 lower_bound: 50 upper_bound: 60 repeats: 0")
 }
 
 func (s *testFeedbackSuite) TestSplitBuckets(c *C) {

diff --git a/statistics/histogram.go b/statistics/histogram.go
@@ -559,41 +559,43 @@ func (hg *Histogram) typeMatch(ranges []*ranger.Range) bool {
 	return true
 }
 
-// SplitRange splits the range according to the histogram upper bound. Note that we treat last bucket's upper bound
-// as inf, so all the split ranges will totally fall in one of the (-inf, u(0)], (u(0), u(1)],...(u(n-3), u(n-2)],
-// (u(n-2), +inf), where n is the number of buckets, u(i) is the i-th bucket's upper bound.
+// SplitRange splits the ranges according to the histogram's lower bounds. Note that we treat the first bucket's lower
+// bound as -inf and the last bucket's upper bound as +inf, so every split range falls entirely within one of (-inf, l(1)),
+// [l(1), l(2)),...[l(n-2), l(n-1)), [l(n-1), +inf), where n is the number of buckets and l(i) is the i-th bucket's lower bound.
 func (hg *Histogram) SplitRange(sc *stmtctx.StatementContext, oldRanges []*ranger.Range, encoded bool) ([]*ranger.Range, bool) {
 	if !hg.typeMatch(oldRanges) {
 		return oldRanges, false
 	}
+	// Treat the only bucket as (-inf, +inf), so we do not need to split it.
+	if hg.Len() == 1 {
+		return oldRanges, true
+	}
 	ranges := make([]*ranger.Range, 0, len(oldRanges))
 	for _, ran := range oldRanges {
 		ranges = append(ranges, ran.Clone())
 	}
 	split := make([]*ranger.Range, 0, len(ranges))
 	for len(ranges) > 0 {
-		// Find the last bound that greater or equal to the LowVal.
+		// Find the first bound that is greater than the LowVal.
 		idx := hg.Bounds.UpperBound(0, &ranges[0].LowVal[0])
-		if !ranges[0].LowExclude && idx > 0 {
-			cmp := chunk.Compare(hg.Bounds.GetRow(idx-1), 0, &ranges[0].LowVal[0])
-			if cmp == 0 {
-				idx--
-			}
-		}
-		// Treat last bucket's upper bound as inf, so we do not need split any more.
-		if idx >= hg.Bounds.NumRows()-2 {
+		// Treat the last bucket's upper bound as +inf, so we do not need to split any more.
+		if idx >= hg.Bounds.NumRows()-1 {
 			split = append(split, ranges...)
 			break
 		}
-		// Get the corresponding upper bound.
-		if idx%2 == 0 {
+		// Treat the first bucket's lower bound as -inf; just advance idx to the next lower bound.
+		if idx == 0 {
+			idx = 2
+		}
+		// Get the next lower bound.
+		if idx%2 == 1 {
 			idx++
 		}
-		upperBound := hg.Bounds.GetRow(idx)
+		lowerBound := hg.Bounds.GetRow(idx)
 		var i int
-		// Find the first range that need to be split by the upper bound.
+		// Find the first range that needs to be split by the lower bound.
 		for ; i < len(ranges); i++ {
-			if chunk.Compare(upperBound, 0, &ranges[i].HighVal[0]) < 0 {
+			if chunk.Compare(lowerBound, 0, &ranges[i].HighVal[0]) <= 0 {
 				break
 			}
 		}
@@ -602,17 +604,20 @@ func (hg *Histogram) SplitRange(sc *stmtctx.StatementContext, oldRanges []*range
 		if len(ranges) == 0 {
 			break
 		}
-		// Split according to the upper bound.
-		cmp := chunk.Compare(upperBound, 0, &ranges[0].LowVal[0])
-		if cmp > 0 || (cmp == 0 && !ranges[0].LowExclude) {
-			upper := upperBound.GetDatum(0, hg.tp)
-			split = append(split, &ranger.Range{
+		// Split according to the lower bound.
+		cmp := chunk.Compare(lowerBound, 0, &ranges[0].LowVal[0])
+		if cmp > 0 {
+			lower := lowerBound.GetDatum(0, hg.tp)
+			newRange := &ranger.Range{
 				LowExclude:  ranges[0].LowExclude,
 				LowVal:      []types.Datum{ranges[0].LowVal[0]},
-				HighVal:     []types.Datum{upper},
-				HighExclude: false})
-			ranges[0].LowVal[0] = upper
-			ranges[0].LowExclude = true
+				HighVal:     []types.Datum{lower},
+				HighExclude: true}
+			if validRange(sc, newRange, encoded) {
+				split = append(split, newRange)
+			}
+			ranges[0].LowVal[0] = lower
+			ranges[0].LowExclude = false
 			if !validRange(sc, ranges[0], encoded) {
 				ranges = ranges[1:]
 			}