Skip to content

Commit

Permalink
Merge pull request #288 from lemolatoon/from-bitmap-bytes
Browse files Browse the repository at this point in the history
Feature Request: `RoaringBitmap::from_lsb0_bytes`
  • Loading branch information
Kerollmops authored Dec 6, 2024
2 parents f532063 + a10a29c commit 56ba1ff
Show file tree
Hide file tree
Showing 7 changed files with 360 additions and 5 deletions.
23 changes: 23 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,22 @@ jobs:
override: true
components: rustfmt, clippy

# Adds the `miri` component to the nightly toolchain. Guarded by `if:` so it only runs for the
# nightly matrix entry; the `cargo miri test` step further down relies on it.
- name: Install miri
uses: actions-rs/toolchain@v1
if: matrix.rust == 'nightly'
with:
profile: minimal
toolchain: nightly
override: true
components: miri

# Runs `cargo miri setup` once up front, on nightly only.
- name: miri setup
uses: actions-rs/cargo@v1
if: matrix.rust == 'nightly'
with:
command: miri
args: setup

- name: Fetch
uses: actions-rs/cargo@v1
with:
Expand Down Expand Up @@ -70,6 +86,13 @@ jobs:
command: test
args: --features serde

- name: Test big endian
uses: actions-rs/cargo@v1
if: matrix.rust == 'nightly'
with:
command: miri
args: test --target s390x-unknown-linux-gnu --package roaring --lib -- bitmap::serialization::test::test_from_lsb0_bytes

- name: Test no default features
uses: actions-rs/cargo@v1
with:
Expand Down
21 changes: 21 additions & 0 deletions benchmarks/benches/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,27 @@ fn creation(c: &mut Criterion) {

group.throughput(Throughput::Elements(dataset.bitmaps.iter().map(|rb| rb.len()).sum()));

group.bench_function(BenchmarkId::new("from_lsb0_bytes", &dataset.name), |b| {
    // Render every bitmap of the dataset as a dense LSB0 byte buffer once, outside the timed
    // closure, so that only `from_lsb0_bytes` itself is measured.
    let mut dense_buffers = Vec::new();
    for bitmap_numbers in dataset_numbers.iter() {
        // The buffer has to reach the byte that holds the largest value.
        let highest = *bitmap_numbers.iter().max().unwrap() as usize;
        let mut dense = vec![0u8; highest / 8 + 1];
        for value in bitmap_numbers {
            dense[(value / 8) as usize] |= 1 << (value % 8);
        }
        dense_buffers.push(dense);
    }

    b.iter(|| {
        dense_buffers.iter().for_each(|dense| {
            black_box(RoaringBitmap::from_lsb0_bytes(0, dense));
        })
    })
});

group.bench_function(BenchmarkId::new("from_sorted_iter", &dataset.name), |b| {
b.iter(|| {
for bitmap_numbers in &dataset_numbers {
Expand Down
4 changes: 4 additions & 0 deletions roaring/src/bitmap/container.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ impl Container {
pub fn full(key: u16) -> Container {
Container { key, store: Store::full() }
}

/// Builds the container stored under `key` from LSB0-ordered `bytes`, which start
/// `byte_offset` bytes into the container.
///
/// Returns `None` whenever `Store::from_lsb0_bytes` produces no store for the given bytes.
pub fn from_lsb0_bytes(key: u16, bytes: &[u8], byte_offset: usize) -> Option<Self> {
    Store::from_lsb0_bytes(bytes, byte_offset).map(|store| Container { key, store })
}
}

impl Container {
Expand Down
218 changes: 213 additions & 5 deletions roaring/src/bitmap/serialization.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
use crate::bitmap::container::{Container, ARRAY_LIMIT};
use crate::bitmap::store::{ArrayStore, BitmapStore, Store, BITMAP_LENGTH};
use crate::RoaringBitmap;
use bytemuck::cast_slice_mut;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use core::convert::Infallible;
use core::mem::size_of;
use core::ops::RangeInclusive;
use std::error::Error;
use std::io;

use crate::bitmap::container::{Container, ARRAY_LIMIT};
use crate::bitmap::store::{ArrayStore, BitmapStore, Store, BITMAP_LENGTH};
use crate::RoaringBitmap;

pub const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
pub const SERIAL_COOKIE: u16 = 12347;
pub const NO_OFFSET_THRESHOLD: usize = 4;
Expand Down Expand Up @@ -47,6 +47,139 @@ impl RoaringBitmap {
8 + container_sizes
}

/// Creates a `RoaringBitmap` from a byte slice, interpreting the bytes as a bitmap with a specified offset.
///
/// # Arguments
///
/// - `offset: u32` - The starting position in the bitmap where the byte slice will be applied, specified in bits.
///   This means that if `offset` is `n`, the least significant bit of the first byte in the slice
///   corresponds to the `n`th bit (0-indexed) in the bitmap.
/// - `bytes: &[u8]` - The byte slice containing the bitmap data. The bytes are interpreted in "Least-Significant-First" bit order.
///
/// # Interpretation of `bytes`
///
/// The `bytes` slice is interpreted in "Least-Significant-First" bit order. Each byte is read from least significant bit (LSB) to most significant bit (MSB).
/// For example, the byte `0b00000101` represents the bits `1, 0, 1, 0, 0, 0, 0, 0` in that order (see Examples section).
///
/// # Panics
///
/// This function will panic if the bits described by `bytes` do not fit below 2^32, i.e. if
/// `offset + bytes.len() * 8` is greater than 2^32 (both terms counted in bits).
///
/// When `offset` is not a multiple of 8, the check is applied after the input has been shifted
/// onto the preceding byte boundary, so up to `offset % 8` unset bits at the very end of the
/// slice may extend past 2^32 without causing a panic.
///
/// # Examples
///
/// ```rust
/// use roaring::RoaringBitmap;
///
/// let bytes = [0b00000101, 0b00000010, 0b00000000, 0b10000000];
/// //             ^^^^^^^^    ^^^^^^^^    ^^^^^^^^    ^^^^^^^^
/// //             76543210          98
/// let rb = RoaringBitmap::from_lsb0_bytes(0, &bytes);
/// assert!(rb.contains(0));
/// assert!(!rb.contains(1));
/// assert!(rb.contains(2));
/// assert!(rb.contains(9));
/// assert!(rb.contains(31));
///
/// let rb = RoaringBitmap::from_lsb0_bytes(8, &bytes);
/// assert!(rb.contains(8));
/// assert!(!rb.contains(9));
/// assert!(rb.contains(10));
/// assert!(rb.contains(17));
/// assert!(rb.contains(39));
///
/// let rb = RoaringBitmap::from_lsb0_bytes(3, &bytes);
/// assert!(rb.contains(3));
/// assert!(!rb.contains(4));
/// assert!(rb.contains(5));
/// assert!(rb.contains(12));
/// assert!(rb.contains(34));
/// ```
pub fn from_lsb0_bytes(offset: u32, mut bytes: &[u8]) -> RoaringBitmap {
    // Byte length of the bit image of one full container (`BITMAP_LENGTH` 64-bit words).
    const CONTAINER_BYTES: usize = BITMAP_LENGTH * size_of::<u64>();

    // Moves every bit of the LSB0 bit string `amount` positions towards higher indices.
    // `amount` must be in `1..=7`. An extra trailing byte is appended only when set bits
    // spill out of the last input byte.
    fn shift_bytes(bytes: &[u8], amount: usize) -> Vec<u8> {
        let mut result = Vec::with_capacity(bytes.len() + 1);
        let mut carry = 0u8;

        for &byte in bytes {
            result.push((byte << amount) | carry);
            // The high bits pushed out of this byte become the low bits of the next one.
            carry = byte >> (8 - amount);
        }

        if carry != 0 {
            result.push(carry);
        }

        result
    }

    // Reduce an unaligned offset to the aligned case: shift the data by the sub-byte part of
    // the offset and restart from the preceding byte boundary.
    let shift = offset % 8;
    if shift != 0 {
        let shifted_bytes = shift_bytes(bytes, shift as usize);
        return RoaringBitmap::from_lsb0_bytes(offset - shift, &shifted_bytes);
    }

    if bytes.is_empty() {
        return RoaringBitmap::new();
    }

    // Using inclusive range avoids overflow: the max exclusive value is 2^32 (u32::MAX + 1).
    let end_bit_inc = u32::try_from(bytes.len())
        .ok()
        .and_then(|len_bytes| len_bytes.checked_mul(8))
        // `bytes` is non-empty, so len_bits is > 0
        .and_then(|len_bits| offset.checked_add(len_bits - 1))
        .expect("offset + bytes.len() * 8 must be <= 2^32");

    // The container key is the upper 16 bits of a bit position; the `*_offset` values are
    // byte positions inside that container's bit image.
    let (mut start_container, start_offset) =
        (offset as usize >> 16, (offset as usize % 0x1_0000) / 8);
    let (end_container_inc, end_offset) =
        (end_bit_inc as usize >> 16, (end_bit_inc as usize % 0x1_0000 + 1) / 8);

    let n_containers_needed = end_container_inc + 1 - start_container;
    let mut containers = Vec::with_capacity(n_containers_needed);

    // Handle a partial first container: the data starts in the middle of it and either ends
    // inside the same container or runs up to its last byte.
    if start_offset != 0 {
        let end_byte =
            if end_container_inc == start_container { end_offset } else { CONTAINER_BYTES };

        let (src, rest) = bytes.split_at(end_byte - start_offset);
        bytes = rest;

        // A `None` result means no container is stored for this span.
        if let Some(container) =
            Container::from_lsb0_bytes(start_container as u16, src, start_offset)
        {
            containers.push(container);
        }

        start_container += 1;
    }

    // Handle all full containers: each one consumes exactly `CONTAINER_BYTES` bytes.
    for full_container_key in start_container..end_container_inc {
        let (src, rest) = bytes.split_at(CONTAINER_BYTES);
        bytes = rest;

        if let Some(container) = Container::from_lsb0_bytes(full_container_key as u16, src, 0) {
            containers.push(container);
        }
    }

    // Handle a last container: whatever is left starts at the beginning of the final
    // container.
    if !bytes.is_empty() {
        if let Some(container) = Container::from_lsb0_bytes(end_container_inc as u16, bytes, 0)
        {
            containers.push(container);
        }
    }

    RoaringBitmap { containers }
}

/// Serialize this bitmap into [the standard Roaring on-disk format][format].
/// This is compatible with the official C/C++, Java and Go implementations.
///
Expand Down Expand Up @@ -256,7 +389,7 @@ impl RoaringBitmap {

#[cfg(test)]
mod test {
use crate::RoaringBitmap;
use crate::{bitmap::store::BITMAP_LENGTH, RoaringBitmap};
use proptest::prelude::*;

proptest! {
Expand All @@ -270,6 +403,81 @@ mod test {
}
}

#[test]
fn test_from_lsb0_bytes() {
    const CONTAINER_OFFSET: u32 = u64::BITS * BITMAP_LENGTH as u32;
    const CONTAINER_OFFSET_IN_BYTES: u32 = CONTAINER_OFFSET / 8;
    let container_len = CONTAINER_OFFSET_IN_BYTES as usize;

    // Input layout: one fully set container, one fully clear container, four sparse bytes.
    let mut bytes: Vec<u8> = Vec::with_capacity(2 * container_len + 4);
    bytes.resize(container_len, 0xff);
    bytes.resize(2 * container_len, 0x00);
    bytes.extend_from_slice(&[0b00000001, 0b00000010, 0b00000011, 0b00000100]);

    let offset = 32;
    let rb = RoaringBitmap::from_lsb0_bytes(offset, &bytes);

    // Boundaries of the three regions once the offset has been applied.
    let set_end = offset + CONTAINER_OFFSET;
    let clear_end = set_end + CONTAINER_OFFSET;

    (0..offset).for_each(|i| {
        assert!(!rb.contains(i), "{i} should not be in the bitmap");
    });
    (offset..set_end).for_each(|i| {
        assert!(rb.contains(i), "{i} should be in the bitmap");
    });
    (set_end..clear_end).for_each(|i| {
        assert!(!rb.contains(i), "{i} should not be in the bitmap");
    });
    for i in [0, 9, 16, 17, 26].map(|bit| bit + clear_end) {
        assert!(rb.contains(i), "{i} should be in the bitmap");
    }

    assert_eq!(rb.len(), u64::from(CONTAINER_OFFSET) + 5);

    // Ensure the empty container is not created
    let mut bytes = vec![0x00u8; container_len];
    bytes.push(0xff);
    let aligned = RoaringBitmap::from_lsb0_bytes(0, &bytes);
    assert_eq!(aligned.min(), Some(CONTAINER_OFFSET));

    let one_byte_in = RoaringBitmap::from_lsb0_bytes(8, &bytes);
    assert_eq!(one_byte_in.min(), Some(CONTAINER_OFFSET + 8));

    // Ensure we can set the last byte in an array container
    let last_byte = [0x80];
    let rb = RoaringBitmap::from_lsb0_bytes(0xFFFFFFF8, &last_byte);
    assert_eq!(rb.len(), 1);
    assert!(rb.contains(u32::MAX));

    // Ensure we can set the last byte in a bitmap container
    let last_container = vec![0xFF; 0x1_0000 / 8];
    let rb = RoaringBitmap::from_lsb0_bytes(0xFFFF0000, &last_container);
    assert_eq!(rb.len(), 0x1_0000);
    assert!(rb.contains(u32::MAX));
}

#[test]
fn test_from_lsb0_bytes_not_multiple_of_8() {
    const CONTAINER_OFFSET: u32 = u64::BITS * BITMAP_LENGTH as u32;
    const CONTAINER_OFFSET_IN_BYTES: u32 = CONTAINER_OFFSET / 8;

    // One sparse leading byte, a container's worth of zero bytes, then four sparse bytes.
    let mut bytes = vec![0x00u8; 1 + CONTAINER_OFFSET_IN_BYTES as usize];
    bytes[0] = 0b0101_1001;
    bytes.extend_from_slice(&[0b00000001, 0b00000010, 0b00000011, 0b00000100]);

    // Positions of the set bits, relative to the first bit of `bytes`.
    let head_bits: [u32; 4] = [0, 3, 4, 6];
    let tail_bits = [0, 9, 16, 17, 26].map(|bit| 8 + CONTAINER_OFFSET + bit);
    let expected: Vec<u32> = head_bits.iter().chain(tail_bits.iter()).copied().collect();

    // Every sub-byte offset must shift all expected bits by exactly that amount.
    for offset in 0..8 {
        let rb = RoaringBitmap::from_lsb0_bytes(offset, &bytes);
        for &relative in &expected {
            let i = relative + offset;
            assert!(rb.contains(i), "{i} should be in the bitmap");
        }
    }
}

#[test]
#[should_panic(expected = "<= 2^32")]
fn test_from_lsb0_bytes_overflow() {
    // Two bytes placed eight bits before the end of the `u32` range: the second byte does
    // not fit, so construction must panic.
    let start = u32::MAX - 7;
    let _ = RoaringBitmap::from_lsb0_bytes(start, &[0x01, 0x01]);
}

#[test]
fn test_deserialize_overflow_s_plus_len() {
let data = vec![59, 48, 0, 0, 255, 130, 254, 59, 48, 2, 0, 41, 255, 255, 166, 197, 4, 0, 2];
Expand Down
28 changes: 28 additions & 0 deletions roaring/src/bitmap/store/array_store/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use crate::bitmap::store::array_store::visitor::{CardinalityCounter, VecWriter};
use core::cmp::Ordering;
use core::cmp::Ordering::*;
use core::fmt::{Display, Formatter};
use core::mem::size_of;
use core::ops::{BitAnd, BitAndAssign, BitOr, BitXor, RangeInclusive, Sub, SubAssign};

#[cfg(not(feature = "std"))]
Expand Down Expand Up @@ -52,6 +53,33 @@ impl ArrayStore {
}
}

/// Builds an array store holding the position of every set bit in the LSB0-ordered `bytes`.
///
/// `byte_offset` is the position of `bytes[0]` inside the container, in bytes. `bits_set` is
/// only used to size the backing vector up front. Positions are pushed in increasing order
/// and handed to `from_vec_unchecked` without further validation.
pub fn from_lsb0_bytes(bytes: &[u8], byte_offset: usize, bits_set: u64) -> Self {
    const WORD_BYTES: usize = size_of::<u64>();

    let mut values = Vec::with_capacity(bits_set as usize);

    let words = bytes.chunks_exact(WORD_BYTES);
    let tail = words.remainder();
    // Byte position (inside the container) of the first byte not covered by a whole word.
    let tail_start = byte_offset + (bytes.len() - tail.len());

    // Fast path: consume eight bytes at a time as one little-endian word.
    let mut word_base = byte_offset * 8;
    for chunk in words {
        let mut pending = u64::from_le_bytes(chunk.try_into().unwrap());
        while pending != 0 {
            values.push((pending.trailing_zeros() + word_base as u32) as u16);
            // Clear the lowest set bit.
            pending &= pending - 1;
        }
        word_base += WORD_BYTES * 8;
    }

    // Leftover bytes that do not fill a whole word are scanned one by one.
    for (position, &leftover) in tail.iter().enumerate() {
        let byte_base = (tail_start + position) * 8;
        let mut pending = leftover;
        while pending != 0 {
            values.push((pending.trailing_zeros() + byte_base as u32) as u16);
            pending &= pending - 1;
        }
    }

    Self::from_vec_unchecked(values)
}

#[inline]
pub fn insert(&mut self, index: u16) -> bool {
self.vec.binary_search(&index).map_err(|loc| self.vec.insert(loc, index)).is_err()
Expand Down
Loading

0 comments on commit 56ba1ff

Please sign in to comment.