Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve: counter overflow and precise lost #1478

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 60 additions & 10 deletions prometheus/counter.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,11 @@ type counter struct {
// in the struct to guarantee alignment for atomic operations.
// http://golang.org/pkg/sync/atomic/#pkg-note-BUG
valBits uint64
valInt uint64

// change is used to record the numbers which will lose if it adds with float64 directly,
// due to the rounding error of IEEE754 representation.
// we aggregate the values until it's large enough to conquer the rounding error
change uint64

selfCollector
desc *Desc
Expand All @@ -119,6 +123,20 @@ type counter struct {
now func() time.Time
}

// addWithRoundingErrorChecking adds the base and addend,
// and returns the result and checks whether the addend is too small that causes a rounding error.
//
// note that if a part of addend is added on base but the left causes a rounding error,
// we don't respect this case
func addWithRoundingErrorChecking(base, addend float64) (float64, bool) {
if addend == 0 {
return base, false
}

sum := base + addend
return sum, sum == base
}

func (c *counter) Desc() *Desc {
return c.desc
}
Expand All @@ -128,15 +146,42 @@ func (c *counter) Add(v float64) {
panic(errors.New("counter cannot decrease in value"))
}

ival := uint64(v)
if float64(ival) == v {
atomic.AddUint64(&c.valInt, ival)
return
}

for {
oldBits := atomic.LoadUint64(&c.valBits)
newBits := math.Float64bits(math.Float64frombits(oldBits) + v)

u := uint64(v)

// not the full value of v will be added into the float as a part of them might have rounding
// error as well, we don't respect this case(we don't have a proper way to handle it).
newF, hasRoundingErr := addWithRoundingErrorChecking(math.Float64frombits(oldBits), v)
if hasRoundingErr && v == float64(u) {
// we believe the float64(uint64(v)) is equal to v if it's an integer
// because it causes a rounding error,
// it doesn't equal only when v is a quite large number or it's a float

oldChange := atomic.LoadUint64(&c.change)
newF, isChangeSmall := addWithRoundingErrorChecking(math.Float64frombits(oldBits), float64(oldChange+u))

if isChangeSmall {
if atomic.CompareAndSwapUint64(&c.change, oldChange, oldChange+u) {
return
}
continue
}
newBits := math.Float64bits(newF)
if atomic.CompareAndSwapUint64(&c.valBits, oldBits, newBits) {
// todo: here we might lose some small values, but it's acceptable
// otherwise we have no way to avoid this using atomic
// mutex might be too heavy for our case here
atomic.StoreUint64(&c.change, 0)
return
}
// fail to update the change as another go routine changes it
continue
}

// we have no idea about the precision losing for float number
newBits := math.Float64bits(newF)
if atomic.CompareAndSwapUint64(&c.valBits, oldBits, newBits) {
return
}
Expand All @@ -149,12 +194,17 @@ func (c *counter) AddWithExemplar(v float64, e Labels) {
}

func (c *counter) Inc() {
atomic.AddUint64(&c.valInt, 1)
// to keep the efficiency, we still increase the change,
// and ignore the rare overflow case as only Inc and float with rounding error will use it
atomic.AddUint64(&c.change, 1)
}

func (c *counter) get() float64 {
fval := math.Float64frombits(atomic.LoadUint64(&c.valBits))
ival := atomic.LoadUint64(&c.valInt)
ival := atomic.LoadUint64(&c.change)
// it's tolerated to lose precision for float during collection
// as this is unavoidable.
// what the client could do is try to keep those data for the future when losing precision
return fval + float64(ival)
}

Expand Down
216 changes: 206 additions & 10 deletions prometheus/counter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,22 +38,22 @@ func TestCounterAdd(t *testing.T) {
if expected, got := 0.0, math.Float64frombits(counter.valBits); expected != got {
t.Errorf("Expected %f, got %f.", expected, got)
}
if expected, got := uint64(1), counter.valInt; expected != got {
if expected, got := uint64(1), counter.change; expected != got {
t.Errorf("Expected %d, got %d.", expected, got)
}
counter.Add(42)
if expected, got := 0.0, math.Float64frombits(counter.valBits); expected != got {
if expected, got := 42.0, math.Float64frombits(counter.valBits); expected != got {
t.Errorf("Expected %f, got %f.", expected, got)
}
if expected, got := uint64(43), counter.valInt; expected != got {
if expected, got := uint64(1), counter.change; expected != got {
t.Errorf("Expected %d, got %d.", expected, got)
}

counter.Add(24.42)
if expected, got := 24.42, math.Float64frombits(counter.valBits); expected != got {
if expected, got := 42+24.42, math.Float64frombits(counter.valBits); expected != got {
t.Errorf("Expected %f, got %f.", expected, got)
}
if expected, got := uint64(43), counter.valInt; expected != got {
if expected, got := uint64(1), counter.change; expected != got {
t.Errorf("Expected %d, got %d.", expected, got)
}

Expand All @@ -79,6 +79,114 @@ func TestCounterAdd(t *testing.T) {
}
}

func TestCounterOverflowAdd(t *testing.T) {
now := time.Now()

counter := NewCounter(CounterOpts{
Name: "test",
Help: "test help",
ConstLabels: Labels{"a": "1", "b": "2"},
now: func() time.Time { return now },
}).(*counter)
counter.change = math.MaxUint64
counter.Inc()

// this is expected due to the overflow
if expected, got := uint64(0), counter.change; expected != got {
t.Errorf("Expected %d, got %d.", expected, got)
}

m := &dto.Metric{}
counter.Write(m)

expected := &dto.Metric{
Label: []*dto.LabelPair{
{Name: proto.String("a"), Value: proto.String("1")},
{Name: proto.String("b"), Value: proto.String("2")},
},
Counter: &dto.Counter{
Value: proto.Float64(0),
CreatedTimestamp: timestamppb.New(now),
},
}
if !proto.Equal(expected, m) {
t.Errorf("expected %q, got %q", expected, m)
}
}

func TestCounterPrecisionLosingAdd(t *testing.T) {
now := time.Now()

counter := NewCounter(CounterOpts{
Name: "test",
Help: "test help",
ConstLabels: Labels{"a": "1", "b": "2"},
now: func() time.Time { return now },
}).(*counter)

counter.Add(1 << 53)
if expected, got := float64(1<<53), math.Float64frombits(counter.valBits); expected != got {
t.Errorf("Expected %f, got %f.", expected, got)
}

counter.Inc()
if expected, got := uint64(1), counter.change; expected != got {
t.Errorf("Expected %d, got %d.", expected, got)
}

// along with the value 1 inside change, the sum could be added into the float without losing precision
counter.Add(1)
if expected, got := float64(1<<53+2), math.Float64frombits(counter.valBits); expected != got {
t.Errorf("Expected %f, got %f.", expected, got)
}
// the change is reset to 0
if expected, got := uint64(0), counter.change; expected != got {
t.Errorf("Expected %d, got %d.", expected, got)
}

// the value will flip with 2
counter.Add(1)
if expected, got := float64(1<<53+4), math.Float64frombits(counter.valBits); expected != got {
t.Errorf("Expected %f, got %f.", expected, got)
}
if expected, got := uint64(0), counter.change; expected != got {
t.Errorf("Expected %d, got %d.", expected, got)
}

counter.Add(1)
if expected, got := float64(1<<53+4), math.Float64frombits(counter.valBits); expected != got {
t.Errorf("Expected %f, got %f.", expected, got)
}
if expected, got := uint64(1), counter.change; expected != got {
t.Errorf("Expected %d, got %d.", expected, got)
}

counter.Add(1)
if expected, got := float64(1<<53+6), math.Float64frombits(counter.valBits); expected != got {
t.Errorf("Expected %f, got %f.", expected, got)
}
if expected, got := uint64(0), counter.change; expected != got {
t.Errorf("Expected %d, got %d.", expected, got)
}

m := &dto.Metric{}
counter.Write(m)

expected := &dto.Metric{
Label: []*dto.LabelPair{
{Name: proto.String("a"), Value: proto.String("1")},
{Name: proto.String("b"), Value: proto.String("2")},
},
Counter: &dto.Counter{
Value: proto.Float64(1<<53 + 6),
CreatedTimestamp: timestamppb.New(now),
},
}
if !proto.Equal(expected, m) {
t.Errorf("expected %q, got %q", expected, m)
}
}

func decreaseCounter(c *counter) (err error) {
defer func() {
if e := recover(); e != nil {
Expand Down Expand Up @@ -157,23 +265,23 @@ func TestCounterAddInf(t *testing.T) {
if expected, got := 0.0, math.Float64frombits(counter.valBits); expected != got {
t.Errorf("Expected %f, got %f.", expected, got)
}
if expected, got := uint64(1), counter.valInt; expected != got {
if expected, got := uint64(1), counter.change; expected != got {
t.Errorf("Expected %d, got %d.", expected, got)
}

counter.Add(math.Inf(1))
if expected, got := math.Inf(1), math.Float64frombits(counter.valBits); expected != got {
t.Errorf("valBits expected %f, got %f.", expected, got)
}
if expected, got := uint64(1), counter.valInt; expected != got {
if expected, got := uint64(1), counter.change; expected != got {
t.Errorf("valInts expected %d, got %d.", expected, got)
}

counter.Inc()
if expected, got := math.Inf(1), math.Float64frombits(counter.valBits); expected != got {
t.Errorf("Expected %f, got %f.", expected, got)
}
if expected, got := uint64(2), counter.valInt; expected != got {
if expected, got := uint64(2), counter.change; expected != got {
t.Errorf("Expected %d, got %d.", expected, got)
}

Expand Down Expand Up @@ -207,7 +315,7 @@ func TestCounterAddLarge(t *testing.T) {
if expected, got := large, math.Float64frombits(counter.valBits); expected != got {
t.Errorf("valBits expected %f, got %f.", expected, got)
}
if expected, got := uint64(0), counter.valInt; expected != got {
if expected, got := uint64(0), counter.change; expected != got {
t.Errorf("valInts expected %d, got %d.", expected, got)
}

Expand Down Expand Up @@ -240,7 +348,7 @@ func TestCounterAddSmall(t *testing.T) {
if expected, got := small, math.Float64frombits(counter.valBits); expected != got {
t.Errorf("valBits expected %f, got %f.", expected, got)
}
if expected, got := uint64(0), counter.valInt; expected != got {
if expected, got := uint64(0), counter.change; expected != got {
t.Errorf("valInts expected %d, got %d.", expected, got)
}

Expand Down Expand Up @@ -386,3 +494,91 @@ func expectCTsForMetricVecValues(t testing.TB, vec *MetricVec, typ dto.MetricTyp
}
}
}

func Test_hasRoundingError(t *testing.T) {
// for reviewing, could use this online tool to convert decimal
// https://baseconvert.com/ieee-754-floating-point
tests := []struct {
name string
base float64
delta float64
wantRoundingError bool
wantNumber float64
}{
{
name: "no rounding error",
base: 0,
delta: 1,
wantRoundingError: false,
wantNumber: 1,
},
{
name: "no rounding error",
base: 1<<53 - 1,
delta: 1,
wantRoundingError: false,
wantNumber: 1 << 53,
},
{
name: "rounding error",
base: 1 << 53,
delta: 1,
wantRoundingError: true,
wantNumber: 1 << 53,
},
{
name: "no rounding error",
base: 1 << 53,
delta: 2,
wantRoundingError: true,
wantNumber: 1<<53 + 2,
},
{
name: "no rounding error",
base: 1<<53 + 2,
delta: 1,
wantRoundingError: false,
// there is a precision losing
wantNumber: 1<<53 + 4,
},
{
name: "rounding error",
base: 1 << 54,
delta: 1,
wantRoundingError: true,
wantNumber: 1 << 54,
},
{
name: "rounding error",
base: 1 << 54,
delta: 3,
wantRoundingError: false,
wantNumber: 18014398509481988,
},
{
name: "rounding error",
base: 1 << 64,
delta: 1000,
wantRoundingError: true,
wantNumber: 1 << 64,
},
{
name: "no rounding error",
base: 1 << 64,
delta: 10000,
wantRoundingError: false,
wantNumber: 18446744073709559808,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
expectNumber, hasRoundingErr := addWithRoundingErrorChecking(tt.base, tt.delta)
if expectNumber != tt.wantNumber {
t.Errorf("addWithRoundingErrorChecking() = %f, wantRoundingError %f", expectNumber, tt.wantNumber)
}
if hasRoundingErr != tt.wantRoundingError {
t.Errorf("addWithRoundingErrorChecking() = %v, wantRoundingError %v", hasRoundingErr, tt.wantRoundingError)
}
})
}
}
Loading