prometheus · xieyuschen · Mar 27, 2024 · Mar 27, 2024
diff --git a/prometheus/counter.go b/prometheus/counter.go
@@ -106,7 +106,11 @@ type counter struct {
 	// in the struct to guarantee alignment for atomic operations.
 	// http://golang.org/pkg/sync/atomic/#pkg-note-BUG
 	valBits uint64
-	valInt  uint64
+
+	// change is used to record the numbers which will lose if it adds with float64 directly,
+	// due to the rounding error of IEEE754 representation.
+	// we aggregate the values until it's large enough to conquer the rounding error
+	change uint64
 
 	selfCollector
 	desc *Desc
@@ -119,6 +123,20 @@ type counter struct {
 	now func() time.Time
 }
 
+// addWithRoundingErrorChecking adds the base and addend,
+// and returns the result and checks whether the addend is too small that causes a rounding error.
+//
+// note that if a part of addend is added on base but the left causes a rounding error,
+// we don't respect this case
+func addWithRoundingErrorChecking(base, addend float64) (float64, bool) {
+	if addend == 0 {
+		return base, false
+	}
+
+	sum := base + addend
+	return sum, sum == base
+}
+
 func (c *counter) Desc() *Desc {
 	return c.desc
 }
@@ -128,15 +146,42 @@ func (c *counter) Add(v float64) {
 		panic(errors.New("counter cannot decrease in value"))
 	}
 
-	ival := uint64(v)
-	if float64(ival) == v {
-		atomic.AddUint64(&c.valInt, ival)
-		return
-	}
-
 	for {
 		oldBits := atomic.LoadUint64(&c.valBits)
-		newBits := math.Float64bits(math.Float64frombits(oldBits) + v)
+
+		u := uint64(v)
+
+		// not the full value of v will be added into the float as a part of them might have rounding
+		// error as well, we don't respect this case(we don't have a proper way to handle it).
+		newF, hasRoundingErr := addWithRoundingErrorChecking(math.Float64frombits(oldBits), v)
+		if hasRoundingErr && v == float64(u) {
+			// we believe the float64(uint64(v)) is equal to v if it's an integer
+			// because it causes a rounding error,
+			// it doesn't equal only when v is a quite large number or it's a float
+
+			oldChange := atomic.LoadUint64(&c.change)
+			newF, isChangeSmall := addWithRoundingErrorChecking(math.Float64frombits(oldBits), float64(oldChange+u))
+
+			if isChangeSmall {
+				if atomic.CompareAndSwapUint64(&c.change, oldChange, oldChange+u) {
+					return
+				}
+				continue
+			}
+			newBits := math.Float64bits(newF)
+			if atomic.CompareAndSwapUint64(&c.valBits, oldBits, newBits) {
+				// todo: here we might lose some small values, but it's acceptable
+				// otherwise we have no way to avoid this using atomic
+				// mutex might be too heavy for our case here
+				atomic.StoreUint64(&c.change, 0)
+				return
+			}
+			// fail to update the change as another go routine changes it
+			continue
+		}
+
+		// we have no idea about the precision losing for float number
+		newBits := math.Float64bits(newF)
 		if atomic.CompareAndSwapUint64(&c.valBits, oldBits, newBits) {
 			return
 		}
@@ -149,12 +194,17 @@ func (c *counter) AddWithExemplar(v float64, e Labels) {
 }
 
 func (c *counter) Inc() {
-	atomic.AddUint64(&c.valInt, 1)
+	// to keep the efficiency, we still increase the change,
+	// and ignore the rare overflow case as only Inc and float with rounding error will use it
+	atomic.AddUint64(&c.change, 1)
 }
 
 func (c *counter) get() float64 {
 	fval := math.Float64frombits(atomic.LoadUint64(&c.valBits))
-	ival := atomic.LoadUint64(&c.valInt)
+	ival := atomic.LoadUint64(&c.change)
+	// it's tolerated to lose precision for float during collection
+	// as this is unavoidable.
+	// what the client could do is try to keep those data for the future when losing precision
 	return fval + float64(ival)
 }
 

diff --git a/prometheus/counter_test.go b/prometheus/counter_test.go
@@ -38,22 +38,22 @@ func TestCounterAdd(t *testing.T) {
 	if expected, got := 0.0, math.Float64frombits(counter.valBits); expected != got {
 		t.Errorf("Expected %f, got %f.", expected, got)
 	}
-	if expected, got := uint64(1), counter.valInt; expected != got {
+	if expected, got := uint64(1), counter.change; expected != got {
 		t.Errorf("Expected %d, got %d.", expected, got)
 	}
 	counter.Add(42)
-	if expected, got := 0.0, math.Float64frombits(counter.valBits); expected != got {
+	if expected, got := 42.0, math.Float64frombits(counter.valBits); expected != got {
 		t.Errorf("Expected %f, got %f.", expected, got)
 	}
-	if expected, got := uint64(43), counter.valInt; expected != got {
+	if expected, got := uint64(1), counter.change; expected != got {
 		t.Errorf("Expected %d, got %d.", expected, got)
 	}
 
 	counter.Add(24.42)
-	if expected, got := 24.42, math.Float64frombits(counter.valBits); expected != got {
+	if expected, got := 42+24.42, math.Float64frombits(counter.valBits); expected != got {
 		t.Errorf("Expected %f, got %f.", expected, got)
 	}
-	if expected, got := uint64(43), counter.valInt; expected != got {
+	if expected, got := uint64(1), counter.change; expected != got {
 		t.Errorf("Expected %d, got %d.", expected, got)
 	}
 
@@ -79,6 +79,114 @@ func TestCounterAdd(t *testing.T) {
 	}
 }
 
+func TestCounterOverflowAdd(t *testing.T) {
+	now := time.Now()
+
+	counter := NewCounter(CounterOpts{
+		Name:        "test",
+		Help:        "test help",
+		ConstLabels: Labels{"a": "1", "b": "2"},
+		now:         func() time.Time { return now },
+	}).(*counter)
+	counter.change = math.MaxUint64
+	counter.Inc()
+
+	// this is expected due to the overflow
+	if expected, got := uint64(0), counter.change; expected != got {
+		t.Errorf("Expected %d, got %d.", expected, got)
+	}
+
+	m := &dto.Metric{}
+	counter.Write(m)
+
+	expected := &dto.Metric{
+		Label: []*dto.LabelPair{
+			{Name: proto.String("a"), Value: proto.String("1")},
+			{Name: proto.String("b"), Value: proto.String("2")},
+		},
+		Counter: &dto.Counter{
+			Value:            proto.Float64(0),
+			CreatedTimestamp: timestamppb.New(now),
+		},
+	}
+	if !proto.Equal(expected, m) {
+		t.Errorf("expected %q, got %q", expected, m)
+	}
+}
+
+func TestCounterPrecisionLosingAdd(t *testing.T) {
+	now := time.Now()
+
+	counter := NewCounter(CounterOpts{
+		Name:        "test",
+		Help:        "test help",
+		ConstLabels: Labels{"a": "1", "b": "2"},
+		now:         func() time.Time { return now },
+	}).(*counter)
+
+	counter.Add(1 << 53)
+	if expected, got := float64(1<<53), math.Float64frombits(counter.valBits); expected != got {
+		t.Errorf("Expected %f, got %f.", expected, got)
+	}
+
+	counter.Inc()
+	if expected, got := uint64(1), counter.change; expected != got {
+		t.Errorf("Expected %d, got %d.", expected, got)
+	}
+
+	// along with the value 1 inside change, the sum could be added into the float without losing precision
+	counter.Add(1)
+	if expected, got := float64(1<<53+2), math.Float64frombits(counter.valBits); expected != got {
+		t.Errorf("Expected %f, got %f.", expected, got)
+	}
+	// the change is reset to 0
+	if expected, got := uint64(0), counter.change; expected != got {
+		t.Errorf("Expected %d, got %d.", expected, got)
+	}
+
+	// the value will flip with 2
+	counter.Add(1)
+	if expected, got := float64(1<<53+4), math.Float64frombits(counter.valBits); expected != got {
+		t.Errorf("Expected %f, got %f.", expected, got)
+	}
+	if expected, got := uint64(0), counter.change; expected != got {
+		t.Errorf("Expected %d, got %d.", expected, got)
+	}
+
+	counter.Add(1)
+	if expected, got := float64(1<<53+4), math.Float64frombits(counter.valBits); expected != got {
+		t.Errorf("Expected %f, got %f.", expected, got)
+	}
+	if expected, got := uint64(1), counter.change; expected != got {
+		t.Errorf("Expected %d, got %d.", expected, got)
+	}
+
+	counter.Add(1)
+	if expected, got := float64(1<<53+6), math.Float64frombits(counter.valBits); expected != got {
+		t.Errorf("Expected %f, got %f.", expected, got)
+	}
+	if expected, got := uint64(0), counter.change; expected != got {
+		t.Errorf("Expected %d, got %d.", expected, got)
+	}
+
+	m := &dto.Metric{}
+	counter.Write(m)
+
+	expected := &dto.Metric{
+		Label: []*dto.LabelPair{
+			{Name: proto.String("a"), Value: proto.String("1")},
+			{Name: proto.String("b"), Value: proto.String("2")},
+		},
+		Counter: &dto.Counter{
+			Value:            proto.Float64(1<<53 + 6),
+			CreatedTimestamp: timestamppb.New(now),
+		},
+	}
+	if !proto.Equal(expected, m) {
+		t.Errorf("expected %q, got %q", expected, m)
+	}
+}
+
 func decreaseCounter(c *counter) (err error) {
 	defer func() {
 		if e := recover(); e != nil {
@@ -157,23 +265,23 @@ func TestCounterAddInf(t *testing.T) {
 	if expected, got := 0.0, math.Float64frombits(counter.valBits); expected != got {
 		t.Errorf("Expected %f, got %f.", expected, got)
 	}
-	if expected, got := uint64(1), counter.valInt; expected != got {
+	if expected, got := uint64(1), counter.change; expected != got {
 		t.Errorf("Expected %d, got %d.", expected, got)
 	}
 
 	counter.Add(math.Inf(1))
 	if expected, got := math.Inf(1), math.Float64frombits(counter.valBits); expected != got {
 		t.Errorf("valBits expected %f, got %f.", expected, got)
 	}
-	if expected, got := uint64(1), counter.valInt; expected != got {
+	if expected, got := uint64(1), counter.change; expected != got {
 		t.Errorf("valInts expected %d, got %d.", expected, got)
 	}
 
 	counter.Inc()
 	if expected, got := math.Inf(1), math.Float64frombits(counter.valBits); expected != got {
 		t.Errorf("Expected %f, got %f.", expected, got)
 	}
-	if expected, got := uint64(2), counter.valInt; expected != got {
+	if expected, got := uint64(2), counter.change; expected != got {
 		t.Errorf("Expected %d, got %d.", expected, got)
 	}
 
@@ -207,7 +315,7 @@ func TestCounterAddLarge(t *testing.T) {
 	if expected, got := large, math.Float64frombits(counter.valBits); expected != got {
 		t.Errorf("valBits expected %f, got %f.", expected, got)
 	}
-	if expected, got := uint64(0), counter.valInt; expected != got {
+	if expected, got := uint64(0), counter.change; expected != got {
 		t.Errorf("valInts expected %d, got %d.", expected, got)
 	}
 
@@ -240,7 +348,7 @@ func TestCounterAddSmall(t *testing.T) {
 	if expected, got := small, math.Float64frombits(counter.valBits); expected != got {
 		t.Errorf("valBits expected %f, got %f.", expected, got)
 	}
-	if expected, got := uint64(0), counter.valInt; expected != got {
+	if expected, got := uint64(0), counter.change; expected != got {
 		t.Errorf("valInts expected %d, got %d.", expected, got)
 	}
 
@@ -386,3 +494,91 @@ func expectCTsForMetricVecValues(t testing.TB, vec *MetricVec, typ dto.MetricTyp
 		}
 	}
 }
+
+func Test_hasRoundingError(t *testing.T) {
+	// for reviewing, could use this online tool to convert decimal
+	// https://baseconvert.com/ieee-754-floating-point
+	tests := []struct {
+		name              string
+		base              float64
+		delta             float64
+		wantRoundingError bool
+		wantNumber        float64
+	}{
+		{
+			name:              "no rounding error",
+			base:              0,
+			delta:             1,
+			wantRoundingError: false,
+			wantNumber:        1,
+		},
+		{
+			name:              "no rounding error",
+			base:              1<<53 - 1,
+			delta:             1,
+			wantRoundingError: false,
+			wantNumber:        1 << 53,
+		},
+		{
+			name:              "rounding error",
+			base:              1 << 53,
+			delta:             1,
+			wantRoundingError: true,
+			wantNumber:        1 << 53,
+		},
+		{
+			name:              "no rounding error",
+			base:              1 << 53,
+			delta:             2,
+			wantRoundingError: true,
+			wantNumber:        1<<53 + 2,
+		},
+		{
+			name:              "no rounding error",
+			base:              1<<53 + 2,
+			delta:             1,
+			wantRoundingError: false,
+			// there is a precision losing
+			wantNumber: 1<<53 + 4,
+		},
+		{
+			name:              "rounding error",
+			base:              1 << 54,
+			delta:             1,
+			wantRoundingError: true,
+			wantNumber:        1 << 54,
+		},
+		{
+			name:              "rounding error",
+			base:              1 << 54,
+			delta:             3,
+			wantRoundingError: false,
+			wantNumber:        18014398509481988,
+		},
+		{
+			name:              "rounding error",
+			base:              1 << 64,
+			delta:             1000,
+			wantRoundingError: true,
+			wantNumber:        1 << 64,
+		},
+		{
+			name:              "no rounding error",
+			base:              1 << 64,
+			delta:             10000,
+			wantRoundingError: false,
+			wantNumber:        18446744073709559808,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			expectNumber, hasRoundingErr := addWithRoundingErrorChecking(tt.base, tt.delta)
+			if expectNumber != tt.wantNumber {
+				t.Errorf("addWithRoundingErrorChecking() = %f, wantRoundingError %f", expectNumber, tt.wantNumber)
+			}
+			if hasRoundingErr != tt.wantRoundingError {
+				t.Errorf("addWithRoundingErrorChecking() = %v, wantRoundingError %v", hasRoundingErr, tt.wantRoundingError)
+			}
+		})
+	}
+}