From 3d57d80f41314b24960b5fe10968dbdd5a970650 Mon Sep 17 00:00:00 2001 From: yankun <1939810907@qq.com> Date: Mon, 4 Dec 2023 21:19:38 +0800 Subject: [PATCH] feat: add from_file API --- Cargo.toml | 2 +- build.sc | 2 +- fastbloom-rs/Cargo.toml | 2 +- fastbloom-rs/src/bloom.rs | 113 ++++++++++++++++++++++++++++++++- fastbloom-rs/src/vec.rs | 35 +++++++++- fastbloom_rs/fastbloom_rs.pyi | 14 ++++ fastbloom_rs/filter.py | 30 +++++++++ fastbloomjvm/native/Cargo.toml | 2 +- py_tests/test_bloom.py | 60 +++++++++++++++-- src/pybloom.rs | 18 ++++++ 10 files changed, 265 insertions(+), 13 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 03af28e..1635e72 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fastbloom_rs" -version = "0.5.8" +version = "0.5.9" edition = "2021" authors = ["Yan Kun "] description = "Some fast bloom filter implemented by Rust for Python and Rust! 10x faster than pybloom!" diff --git a/build.sc b/build.sc index c0576d5..67dace4 100644 --- a/build.sc +++ b/build.sc @@ -19,7 +19,7 @@ object ProjectInfo { def author = Seq("Yan Kun ") - def version = "0.5.8" + def version = "0.5.9" def buildTool = "mill" diff --git a/fastbloom-rs/Cargo.toml b/fastbloom-rs/Cargo.toml index 0888686..690bdd7 100644 --- a/fastbloom-rs/Cargo.toml +++ b/fastbloom-rs/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fastbloom-rs" -version = "0.5.8" +version = "0.5.9" edition = "2021" authors = ["Yan Kun "] description = "Some fast bloom filter implemented by Rust for Python and Rust!" diff --git a/fastbloom-rs/src/bloom.rs b/fastbloom-rs/src/bloom.rs index 29f4b80..0ea5674 100644 --- a/fastbloom-rs/src/bloom.rs +++ b/fastbloom-rs/src/bloom.rs @@ -1,8 +1,8 @@ -use std::clone; use std::cmp::min; +use std::fs::{File, OpenOptions}; +use std::io::{Write, Read}; use std::ptr::slice_from_raw_parts; -use fastmurmur3::murmur3_x64_128; use xxhash_rust::xxh3::xxh3_64_with_seed; use crate::{Deletable, Hashes, Membership}; @@ -161,6 +161,37 @@ impl BloomFilter { self.config.hashes as u64) } + /// Build a Bloom filter from file with first four bytes is hashes which is encode by big-endian. + /// The remaining is underlying byte vector of the Bloom filter. + pub fn from_file_with_hashes(path: &str) -> Self { + let mut f = File::open(path).unwrap(); + let len = f.metadata().unwrap().len() - 4; + let mut hash = [0; 4]; + f.read_exact(&mut hash).unwrap(); + let hashes = u32::from_be_bytes(hash); + + let mut config = + FilterBuilder::from_size_and_hashes((len * 8) as u64, hashes); + config.complete(); + + let bit_set = BloomBitVec::from_file(&mut f, 4, len); + + BloomFilter { config, bit_set } + } + + /// Build a Bloom filter from file. The content is underlying byte vector of the Bloom filter. + pub fn from_file(path: &str, hashes: u32) -> Self { + let mut f = File::open(path).unwrap(); + let len = f.metadata().unwrap().len(); + let mut config = + FilterBuilder::from_size_and_hashes((len * 8) as u64, hashes); + config.complete(); + + let bit_set = BloomBitVec::from_file(&mut f, 0, len); + + BloomFilter { config, bit_set } + } + /// Build a Bloom filter form `&[u8]`. /// /// # Examples @@ -292,6 +323,26 @@ impl BloomFilter { self.config.clone() } + /// Save the bloom filter to file, and the first four bytes is hashes with + /// big-endian, and the remaining bytes is underlying byte vector of the Bloom filter. + pub fn save_to_file_with_hashes(&mut self, path: &str) { + let mut file = File::create(path).unwrap(); + let hash = self.hashes().to_be_bytes(); + file.write_all(&hash).unwrap(); + + let bytes = self.get_u8_array(); + let mut file = OpenOptions::new().append(true).open(path).unwrap(); + file.write_all(bytes).unwrap(); + } + + /// Save the bloom filter to file, and the content of the file is underlying byte + /// vector of the Bloom filter. + pub fn save_to_file(&mut self, path: &str) { + let mut file = File::create(path).unwrap(); + let bytes = self.get_u8_array(); + file.write_all(bytes).unwrap(); + } + /// Return the underlying byte vector of the Bloom filter. pub fn get_u8_array(&self) -> &[u8] { let storage = &self.bit_set.storage; @@ -761,6 +812,64 @@ fn bloom_hash_indices_test() { assert_eq!(bloom.contains_hash_indices(&bloom.get_hash_indices(b"world")), false); } +#[test] +fn bloom_large() { + let mut builder = + FilterBuilder::new(1_000_000_000, 0.0001); + let mut bloom = builder.build_bloom_filter(); + + bloom.add(b"hello"); + assert_eq!(bloom.contains(b"hello"), true); + + let bloom = BloomFilter::from_u8_array(bloom.get_u8_array(), bloom.hashes()); + + assert_eq!(bloom.contains(b"hello"), true); + +} + +#[test] +fn bloom_save_and_load_file_hashes() { + { + let mut builder = FilterBuilder::new(1_000_000_000, 0.0001); + let mut bloom = builder.build_bloom_filter(); + + bloom.add(b"hello"); + assert_eq!(bloom.contains(b"hello"), true); + bloom.save_to_file_with_hashes("hello.bloom"); + } + + + let bloom2 = BloomFilter::from_file_with_hashes("hello.bloom"); + fs::remove_file("hello.bloom").unwrap(); + + assert_eq!(bloom2.contains(b"hello"), true); + assert_eq!(bloom2.contains(b"world"), false); + +} + +#[test] +fn bloom_save_and_load_file() { + let mut hashes = 0; + { + let mut builder = FilterBuilder::new(1_000_000_000, 0.0001); + let mut bloom = builder.build_bloom_filter(); + + bloom.add(b"hello"); + assert_eq!(bloom.contains(b"hello"), true); + + hashes = bloom.hashes(); + + bloom.save_to_file("no_hashes.bloom"); + } + + let bloom2 = BloomFilter::from_file("no_hashes.bloom", hashes); + fs::remove_file("no_hashes.bloom").unwrap(); + + assert_eq!(bloom2.contains(b"hello"), true); + assert_eq!(bloom2.contains(b"world"), false); + +} + #[test] fn counting_bloom_test() { diff --git a/fastbloom-rs/src/vec.rs b/fastbloom-rs/src/vec.rs index e8e3c2a..9d5c215 100644 --- a/fastbloom-rs/src/vec.rs +++ b/fastbloom-rs/src/vec.rs @@ -1,4 +1,5 @@ -use core::mem::size_of; +use core::slice; +use std::{fs::File, os::windows::fs::FileExt}; use crate::builder::SUFFIX; @@ -25,12 +26,44 @@ impl BloomBitVec { nbits: (slots * get_usize_len()) as u64, } } + pub fn from_elem(slots: usize, bit: bool) -> Self { BloomBitVec { storage: vec![if bit { !0 } else { 0 }; slots], nbits: (slots * get_usize_len()) as u64, } } + + pub fn from_file(file: &mut File, seek: u64, bytes_len: u64) -> Self { + #[cfg(target_pointer_width = "64")] + let length = bytes_len / 8; + #[cfg(target_pointer_width = "32")] + let length = bytes_len / 4; + + let nbits = bytes_len * 8; + + let mut storage = vec![0usize; length.try_into().unwrap()]; + let ptr = storage.as_mut_ptr(); + let buf = ptr as *mut u8; + let buf = unsafe { + slice::from_raw_parts_mut(buf, bytes_len.try_into().unwrap()) + }; + + let mut cursor = seek; + let mut read = 0u64; + + while read < bytes_len { + let size = file.seek_read(&mut buf[read as usize ..], cursor).unwrap() as u64; + read = read + size; + cursor += size; + } + + + BloomBitVec { + storage, + nbits: nbits.try_into().unwrap() + } + } #[inline] pub fn set(&mut self, index: usize) { diff --git a/fastbloom_rs/fastbloom_rs.pyi b/fastbloom_rs/fastbloom_rs.pyi index 7f23a70..bbb3695 100644 --- a/fastbloom_rs/fastbloom_rs.pyi +++ b/fastbloom_rs/fastbloom_rs.pyi @@ -96,6 +96,12 @@ class PyBloomFilter(object): def get_int_array(self) -> Sequence[int]: ... + def save_to_file_with_hashes(self, path: str): + ... + + def save_to_file(self, path: str): + ... + def clear(self): ... @@ -128,6 +134,14 @@ class PyBloomFilter(object): def from_int_array(array: Sequence[int], hashes: int) -> PyBloomFilter: ... + @staticmethod + def from_file_with_hashes(path: str) -> PyBloomFilter: + ... + + @staticmethod + def from_file(path: str, hashes: int) -> PyBloomFilter: + ... + class PyCountingBloomFilter(object): def add(self, element: Union[str, int, bytes]): diff --git a/fastbloom_rs/filter.py b/fastbloom_rs/filter.py index 52896b0..2b17e66 100644 --- a/fastbloom_rs/filter.py +++ b/fastbloom_rs/filter.py @@ -312,6 +312,20 @@ def get_int_array(self) -> Sequence[int]: """ return self._py_bloom.get_int_array() + def save_to_file_with_hashes(self, path: str): + """ + Save the bloom filter to file, and the first four bytes is hashes with + big-endian, and the remaining bytes is underlying byte vector of the Bloom filter. + """ + return self._py_bloom.save_to_file_with_hashes(path) + + def save_to_file(self, path: str): + """ + Save the bloom filter to file, and the content of the file is underlying byte + vector of the Bloom filter. + """ + return self._py_bloom.save_to_file(path) + def clear(self): """ Removes all elements from the filter (i.e. resets all bits to zero). @@ -388,6 +402,22 @@ def from_int_array(array: Sequence[int], hashes: int) -> "BloomFilter": py_bloom = PyBloomFilter.from_int_array(array, hashes) return BloomFilter(py_bloom) + @staticmethod + def from_file_with_hashes(path: str) -> PyBloomFilter: + """ + Build a Bloom filter from file with first four bytes is hashes which is encode by big-endian. + The remaining is underlying byte vector of the Bloom filter. + """ + py_bloom = PyBloomFilter.from_file_with_hashes(path) + return BloomFilter(py_bloom) + + @staticmethod + def from_file(path: str, hashes: int) -> PyBloomFilter: + """ + Build a Bloom filter from file. The content is underlying byte vector of the Bloom filter. + """ + py_bloom = PyBloomFilter.from_file(path, hashes) + return BloomFilter(py_bloom) class CountingBloomFilter(object): """ diff --git a/fastbloomjvm/native/Cargo.toml b/fastbloomjvm/native/Cargo.toml index 5983910..1434e53 100644 --- a/fastbloomjvm/native/Cargo.toml +++ b/fastbloomjvm/native/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fastbloom" # generated by nativeInit with defaultNativeName -version = "0.5.8" +version = "0.5.9" authors = ["Yan Kun "] edition = "2021" diff --git a/py_tests/test_bloom.py b/py_tests/test_bloom.py index caccedb..3d3affd 100644 --- a/py_tests/test_bloom.py +++ b/py_tests/test_bloom.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- from fastbloom_rs import BloomFilter, FilterBuilder +import os def test_bloom_builder(): @@ -66,8 +67,9 @@ def test_bloom_estimate_set_cardinality(): bloom = BloomFilter(100_000_000, 0.01) for data in range(0, 10_000_000): bloom.add_int(data) - - assert (bloom.estimate_set_cardinality() < 10_100_000) and (bloom.estimate_set_cardinality() > 9_900_000) + + assert (bloom.estimate_set_cardinality() < 10_100_000) and ( + bloom.estimate_set_cardinality() > 9_900_000) def test_bloom_op(): @@ -125,7 +127,8 @@ def test_hash_indices(): bloom2 = BloomFilter(100_000_000, 0.01) bloom2.add_str("Yan Kun") - assert bloom.get_hash_indices(b'hello') == bloom2.get_hash_indices(b'hello') + assert bloom.get_hash_indices( + b'hello') == bloom2.get_hash_indices(b'hello') assert bloom.contains_hash_indices(bloom.get_hash_indices(b'hello')) assert bloom.contains_hash_indices(bloom.get_hash_indices(87)) @@ -142,14 +145,59 @@ def test_batch_check(): bloom = BloomFilter(100_000_000, 0.01) inserts = [1, 2, 3, 4, 5, 6, 7, 9, 18, 68, 90, 100] checks = [1, 2, 3, 4, 5, 6, 7, 9, 18, 68, 90, 100, 190, 290, 390] - results = [True, True, True, True, True, True, True, True, True, True, True, True, False, False, False] + results = [True, True, True, True, True, True, True, + True, True, True, True, True, False, False, False] bloom.add_int_batch(inserts) contains = bloom.contains_int_batch(checks) assert contains == results bloom.add_str_batch(list(map(lambda x: str(x), inserts))) - assert bloom.contains_str_batch(list(map(lambda x: str(x), checks))) == results + assert bloom.contains_str_batch( + list(map(lambda x: str(x), checks))) == results bloom.add_bytes_batch(list(map(lambda x: bytes(x), inserts))) - assert bloom.contains_bytes_batch(list(map(lambda x: bytes(x), checks))) == results + assert bloom.contains_bytes_batch( + list(map(lambda x: bytes(x), checks))) == results + + +def test_save_load_hashes_file(): + bloom = BloomFilter(1_000_000_000, 0.0001) + bloom.add_bytes(b'hello') + bloom.add(87) + + assert bloom.contains_bytes(b'hello') + assert bloom.contains_int(87) + assert not bloom.contains('world') + + bloom.save_to_file_with_hashes('fst.bloom') + del bloom # gc + + bloom = BloomFilter.from_file_with_hashes('fst.bloom') + assert bloom.contains_bytes(b'hello') + assert bloom.contains_int(87) + assert not bloom.contains('world') + + os.remove('fst.bloom') + +def test_save_load_file(): + hashes = 0 + bloom = BloomFilter(1_000_000_000, 0.01) + bloom.add_bytes(b'hello') + bloom.add(87) + + hashes = bloom.hashes() + + assert bloom.contains_bytes(b'hello') + assert bloom.contains_int(87) + assert not bloom.contains('world') + + bloom.save_to_file('fst.bloom') + del bloom # gc + + bloom = BloomFilter.from_file('fst.bloom', hashes) + assert bloom.contains_bytes(b'hello') + assert bloom.contains_int(87) + assert not bloom.contains('world') + + os.remove('fst.bloom') diff --git a/src/pybloom.rs b/src/pybloom.rs index 74f3238..9ea56b3 100644 --- a/src/pybloom.rs +++ b/src/pybloom.rs @@ -159,6 +159,14 @@ impl PyBloomFilter { Ok(Vec::from(self.bloomfilter.get_u32_array())) } + pub fn save_to_file_with_hashes(&mut self, path: &str) { + self.bloomfilter.save_to_file_with_hashes(path); + } + + pub fn save_to_file(&mut self, path: &str) { + self.bloomfilter.save_to_file(path); + } + pub fn clear(&mut self) { self.bloomfilter.clear() } @@ -201,6 +209,16 @@ impl PyBloomFilter { pub fn from_int_array(array: Vec, hashes: u32) -> PyResult { Ok(PyBloomFilter { bloomfilter: BloomFilter::from_u32_array(array.as_slice(), hashes) }) } + + #[staticmethod] + pub fn from_file_with_hashes(path: &str) -> PyResult { + Ok(PyBloomFilter { bloomfilter: BloomFilter::from_file_with_hashes(path) }) + } + + #[staticmethod] + pub fn from_file(path: &str, hashes: u32) -> PyResult { + Ok(PyBloomFilter { bloomfilter: BloomFilter::from_file(path, hashes) }) + } } #[pyclass]