diff --git a/fastbloom-rs/src/bloom.rs b/fastbloom-rs/src/bloom.rs index a03f475..29f4b80 100644 --- a/fastbloom-rs/src/bloom.rs +++ b/fastbloom-rs/src/bloom.rs @@ -372,6 +372,12 @@ impl BloomFilter { self.bit_set.is_empty() } + /// Returns the estimated cardinality of the set, computed from the fraction of zero bits, the configured filter size and the number of hash functions. + /// See [Scalable and Efficient Privacy Preserving Global Itemset Support Approximation Using Bloom Filters](https://inria.hal.science/hal-01284874/document) for reference. + pub fn estimate_set_cardinality(&self) -> f64 { + (self.bit_set.count_zeros() as f64 / self.config.size as f64).ln() / (self.hashes() as f64 * (1.0 - 1.0/self.config.size as f64).ln()) + } + pub(crate) fn set_bit_vec(&mut self, bit_vec: BloomBitVec) { assert_eq!(self.config.size, bit_vec.nbits as u64); self.bit_set = bit_vec diff --git a/fastbloom-rs/src/vec.rs b/fastbloom-rs/src/vec.rs index 783851f..e8e3c2a 100644 --- a/fastbloom-rs/src/vec.rs +++ b/fastbloom-rs/src/vec.rs @@ -96,6 +96,9 @@ impl BloomBitVec { } } + pub fn count_zeros(&self)->u32 { + self.storage.iter().fold(0, |acc, x| acc + x.count_zeros()) + } pub fn clear(&mut self) { self.storage.fill(0); @@ -222,4 +225,17 @@ fn test_count_vec() { vec.increment(7); assert_eq!(1, vec.get(7)) +} + +#[test] +fn test_count_zeros() { + let mut vec = BloomBitVec::new(4); + vec.set(37); + vec.set(30); + vec.set(38); + println!("{:?}", vec); + #[cfg(target_pointer_width = "64")] + assert_eq!(vec.count_zeros(), 253); + #[cfg(target_pointer_width = "32")] + assert_eq!(vec.count_zeros(), 125); } \ No newline at end of file diff --git a/fastbloom_rs/fastbloom_rs.abi3.so b/fastbloom_rs/fastbloom_rs.abi3.so new file mode 100755 index 0000000..6ebb301 Binary files /dev/null and b/fastbloom_rs/fastbloom_rs.abi3.so differ diff --git a/fastbloom_rs/fastbloom_rs.pyi b/fastbloom_rs/fastbloom_rs.pyi index c3db229..7f23a70 100644 --- a/fastbloom_rs/fastbloom_rs.pyi +++ b/fastbloom_rs/fastbloom_rs.pyi @@ -99,6 +99,9 @@ class PyBloomFilter(object): def clear(self): ... 
+ def estimate_set_cardinality(self): + ... + def get_hash_indices(self, element: bytes) -> Sequence[int]: ... diff --git a/fastbloom_rs/filter.py b/fastbloom_rs/filter.py index 481d8eb..52896b0 100644 --- a/fastbloom_rs/filter.py +++ b/fastbloom_rs/filter.py @@ -327,6 +327,14 @@ def is_empty(self) -> bool: :return: """ return self._py_bloom.is_empty() + + def estimate_set_cardinality(self) -> float: + """ + Returns the estimated cardinality of the set + + :return: + """ + return self._py_bloom.estimate_set_cardinality() def union(self, other: "BloomFilter") -> bool: """ diff --git a/py_tests/test_bloom.py b/py_tests/test_bloom.py index 83400ad..caccedb 100644 --- a/py_tests/test_bloom.py +++ b/py_tests/test_bloom.py @@ -62,6 +62,14 @@ def test_bloom_add(): assert not ('hello' in bloom) +def test_bloom_estimate_set_cardinality(): + bloom = BloomFilter(100_000_000, 0.01) + for data in range(0, 10_000_000): + bloom.add_int(data) + + assert (bloom.estimate_set_cardinality() < 10_100_000) and (bloom.estimate_set_cardinality() > 9_900_000) + + def test_bloom_op(): bloom = BloomFilter(100_000_000, 0.001) bloom.add_bytes(b'hello') diff --git a/src/pybloom.rs b/src/pybloom.rs index 569253c..74f3238 100644 --- a/src/pybloom.rs +++ b/src/pybloom.rs @@ -167,6 +167,10 @@ impl PyBloomFilter { Ok(self.bloomfilter.is_empty()) } + pub fn estimate_set_cardinality(&self) -> PyResult { + Ok(self.bloomfilter.estimate_set_cardinality()) + } + pub fn union(&mut self, other: &PyBloomFilter) -> PyResult { Ok(self.bloomfilter.union(&other.bloomfilter)) }