-
Notifications
You must be signed in to change notification settings - Fork 5.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
stats: refine updating stats using feedback #6796
Conversation
statistics/feedback.go
Outdated
} | ||
return count | ||
} | ||
|
||
// Get the split count for the histogram. | ||
func getSplitCount(count, remainBuckets int) int { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about numFeedbacks
instead of count
?
statistics/feedback.go
Outdated
} | ||
return int64(count) | ||
} | ||
|
||
// updateBucket split the bucket according to feedback. | ||
func (b *BucketFeedback) splitBucket(newBktNum int, totalCount float64, count float64) []bucket { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
comment and method name not match.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The count
should be renamed to originBucketCount
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
s/newBktNum/newNumBkts
fbLower, fbUpper = getFraction4PK(minValue, maxValue, fb.lower, fb.upper) | ||
bktLower, bktUpper = getFraction4PK(minValue, maxValue, bkt.lower, bkt.upper) | ||
} | ||
ratio := (bktUpper - bktLower) / (fbUpper - fbLower) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The ratio seems not consistent with the comment, should be (fbUpper - fbLower) /(bktUpper - bktLower)
?
And later we calculate the new bucket count with count * ratio
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, the comment is wrong. The count
is the feedback count, so this ratio is right.
statistics/feedback.go
Outdated
return overlap, ratio | ||
} | ||
|
||
func (b *BucketFeedback) bucketCount(bkt bucket, defaultCount float64) float64 { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about name it refineBucketCount
?
And add some comment about this method.
statistics/feedback.go
Outdated
@@ -425,31 +447,18 @@ func mergeBuckets(bkts []bucket, isNewBuckets []bool, totalCount float64) []buck | |||
|
|||
func splitBuckets(h *Histogram, feedback *QueryFeedback) ([]bucket, []bool, int64) { | |||
bktID2FB, fbNum := buildBucketFeedback(h, feedback) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
s/fbNum/totalNumFBs
statistics/feedback.go
Outdated
@@ -425,31 +447,18 @@ func mergeBuckets(bkts []bucket, isNewBuckets []bool, totalCount float64) []buck | |||
|
|||
func splitBuckets(h *Histogram, feedback *QueryFeedback) ([]bucket, []bool, int64) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
add comments about this function.
statistics/feedback.go
Outdated
} | ||
return count | ||
} | ||
|
||
// Get the split count for the histogram. | ||
func getSplitCount(count, remainBuckets int) int { | ||
remainBuckets = mathutil.Max(remainBuckets, 10) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
better use another variable like splitCount
rather than modify the input argument.
statistics/feedback.go
Outdated
isNewBuckets = append(isNewBuckets, false) | ||
continue | ||
} | ||
bkts := bkt.splitBucket(splitCount*len(bkt.feedback)/fbNum, float64(totCount), float64(counts[i])) | ||
bkts := bkt.splitBucket(splitCount*len(bkt.feedback)/fbNum, h.totalRowCount(), float64(h.bucketCount(i))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Need a dedicated variable for splitCount*len(bkt.feedback)/fbNum
and add a comment to explain why.
statistics/feedback.go
Outdated
} | ||
return count | ||
} | ||
|
||
// Get the split count for the histogram. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Need to explain why choose this algorithm.
LGTM |
statistics/feedback.go
Outdated
isNewBuckets = append(isNewBuckets, false) | ||
continue | ||
} | ||
bkts := bkt.splitBucket(splitCount*len(bkt.feedback)/fbNum, float64(totCount), float64(counts[i])) | ||
// distribute the total split count to bucket based on number of bucket feedback |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
distribute
-> Distribute
.
Add .
at the end.
} | ||
var fbLower, fbUpper, bktLower, bktUpper float64 | ||
minValue, maxValue := &datums[0], &datums[3] | ||
if datums[0].Kind() == types.KindBytes { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What about add comment at the declaration of bucket
to tell that the datum in bucket can only be Bytes
or Int
, and Bytes
is for index and Int
is for int pk.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
statistics/feedback.go
Outdated
isNewBuckets = append(isNewBuckets, false) | ||
continue | ||
} | ||
bkts := bkt.splitBucket(splitCount*len(bkt.feedback)/fbNum, float64(totCount), float64(counts[i])) | ||
// distribute the total split count to bucket based on number of bucket feedback | ||
newBktNums := splitCount * len(bkt.feedback) / totalNumFBs |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
how about:
- s/totalNumFBs/numTotalFBs/
- s/bkt/bktFB/
numFBs := len(bktFB.feedback)
numNewBkts := splitCount * numFBs/numTotalFBs
statistics/feedback.go
Outdated
bkt := bucket{&bounds[i-1], bounds[i].Copy(), 0, 0} | ||
// get bucket count | ||
_, ratio := getOverlapFraction(feedback{b.lower, b.upper, int64(originBucketCount), 0}, bkt) | ||
bucketCount := originBucketCount * ratio |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
how about:
- s/bkts/newBkts/
- s/bkt/newBkt/
- s/bucketCount/countInNewBkt/
statistics/feedback.go
Outdated
bkt := bucket{lower: b.lower, upper: b.upper, count: int64(count)} | ||
return []bucket{bkt} | ||
} | ||
// splitBucket split the bucket according to feedback. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
how about changing this comment to:
// splitBucket firstly splits this "BucketFeedback" to "newNumBkts" new buckets,
// calculates the count for each new bucket, merge the new bucket whose count
// is smaller than "minBucketFraction*totalCountwith" with the next new bucket
// until the last new bucket.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
/run-all-tests |
What have you changed? (mandatory)
In the before, we first update the bucket's count, then split the bucket, this PR reverse these two steps, because:
In the before, when the feedback only covers a very small fraction of the bucket, we are not able to update the stats because we limit the feedback fraction, this may happen when the distribution is too discrete. In this PR, we are able to do it, because we first split the bucket, then calculate the new bucket's count, and only check if the bucket is too small at last.
We can have more accurate bucket count info after the change, since the more the feedback and bucket overlap, the more accuracy we can have. This brings the benefit that we can have better stats in fewer rounds of updating.
What are the type of the changes (mandatory)?
How has this PR been tested (mandatory)?
unit test