Skip to content

Commit

Permalink
distinct: avoid floating-point computation in the Add step
Browse files Browse the repository at this point in the history
Instead of tracking the current eviction probability as floating-point, take
advantage of the fact that it always has the form 1/2^k and use an integer
threshold instead. This lets us avoid floating-point computations both for RNG
sampling and for updates.

While here, make the Count method return unsigned instead of signed, since the
value will never be negative (even in a case of undercount).
  • Loading branch information
creachadair committed Aug 14, 2024
1 parent ed06c2c commit 192e85d
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 14 deletions.
39 changes: 27 additions & 12 deletions distinct/distinct.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
crand "crypto/rand"
"fmt"
"math"
"math/bits"
"math/rand/v2"

"github.com/creachadair/mds/mapset"
Expand All @@ -21,9 +22,13 @@ import (
// obtain the current estimate of the number of distinct elements observed.
type Counter[T comparable] struct {
buf mapset.Set[T]
cap int // maximum allowed size of buf
p float64 // eviction probability
rng *rand.Rand
cap int // maximum allowed size of buf
p uint64 // eviction probability (see below)
rng rand.Source

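// To avoid the need for floating-point calculations during update, we
// express the probability as a fixed-point threshold in 0..MaxUint64, where
// 0 denotes probability 0 and ~0 denotes probability 1. A uniformly random
// 64-bit value is compared against the threshold to decide whether to keep
// an element, and the threshold is halved (shifted right by one bit) after
// each eviction pass.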
}

// NewCounter constructs a new empty distinct-elements counter using a buffer
Expand All @@ -39,8 +44,8 @@ func NewCounter[T comparable](size int) *Counter[T] {
return &Counter[T]{
buf: make(mapset.Set[T]),
cap: size,
p: 1,
rng: rand.New(rand.NewChaCha8(seed)),
p: math.MaxUint64,
rng: rand.NewChaCha8(seed),
}
}

Expand All @@ -49,11 +54,11 @@ func (c *Counter[T]) Len() int { return c.buf.Len() }

// Reset resets c to its initial state, as if freshly constructed.
// The internal buffer size limit remains unchanged.
func (c *Counter[T]) Reset() { c.buf.Clear(); c.p = 1 }
func (c *Counter[T]) Reset() {
	// Drop every buffered element.
	c.buf.Clear()

	// Restore the eviction threshold to its maximum, the same value
	// NewCounter installs.
	c.p = math.MaxUint64
}

// Add adds v to the counter.
func (c *Counter[T]) Add(v T) {
if c.p < 1 && c.rng.Float64() >= c.p {
if c.p < math.MaxUint64 && c.rng.Uint64() >= c.p {
c.buf.Remove(v)
return
}
Expand All @@ -74,23 +79,33 @@ func (c *Counter[T]) Add(v T) {
rnd >>= 1
nb--
}
c.p /= 2
c.p >>= 1
}
}

// Count returns the current estimate of the number of distinct elements
// observed by the counter.
func (c *Counter[T]) Count() int64 { return int64(float64(c.buf.Len()) / c.p) }
func (c *Counter[T]) Count() uint64 {
	// The estimator is |X| / p with p = 1/2^k after k eviction passes, which
	// is the same as |X| * 2^k.
	//
	// The threshold c.p starts at MaxUint64 and is shifted right once per
	// eviction pass, so its count of leading zero bits is exactly k. That
	// lets the whole estimate be computed with integer operations only.
	k := bits.LeadingZeros64(c.p)
	scale := uint64(1) << uint64(k)
	return scale * uint64(c.buf.Len())
}

// BufferSize returns a buffer size sufficient to ensure that a counter using
// this size will produce estimates within (1 ± ε) times the true count with
// probability (1 - δ), assuming the expected total number of elements to be
// counted is expSize.
//
// The suggested buffer size guarantees these constraints, but note that the
// estimate is very conservative. In practice, the actual estimates will
// usually be much more accurate. Empirically, values of ε and δ in the 0.05
// range work well.
// Chernoff bound estimate is very conservative. In practice, the actual
// estimates will usually be much more accurate. Empirically, values of ε and δ
// in the 0.05 range work well.
func BufferSize(ε, δ float64, expSize int) int {
if ε < 0 || ε > 1 {
panic(fmt.Sprintf("error bound out of range: %v", ε))
Expand Down
6 changes: 4 additions & 2 deletions distinct/distinct_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ func TestCounter(t *testing.T) {
t.Logf("Estimated count: %d", c.Count())
t.Logf("Buffer size: %d", c.Len())

e := float64(c.Count()-int64(actual.Len())) / float64(actual.Len())
e := observedErrorRate(int(c.Count()), actual.Len())
t.Logf("Error: %.4g%%", 100*e)

if math.Abs(e) > *errRate {
Expand Down Expand Up @@ -94,7 +94,7 @@ func TestCounter(t *testing.T) {
var maxErr float64
for i := 0; i < 1_000_000; i += 500 {
actual.AddAll(fill(c, 500))
e := float64(c.Count()-int64(actual.Len())) / float64(actual.Len())
e := observedErrorRate(int(c.Count()), actual.Len())
if math.Abs(e) > math.Abs(maxErr) {
maxErr = e
t.Logf("At %d unique items, max error is %.4g%%", actual.Len(), 100*maxErr)
Expand All @@ -106,3 +106,5 @@ func TestCounter(t *testing.T) {
t.Logf("Max error: %.4g%%", 100*maxErr)
})
}

// observedErrorRate reports the signed relative error of got with respect to
// want, computed as (got - want) / want. A positive result means got exceeds
// want; a negative result means got falls short of it.
func observedErrorRate(got, want int) float64 {
	delta := got - want
	return float64(delta) / float64(want)
}

0 comments on commit 192e85d

Please sign in to comment.