Skip to content

Commit

Permalink
Added delta-byte encoder
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao committed Aug 16, 2022
1 parent 7caafa4 commit 0403de1
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 0 deletions.
31 changes: 31 additions & 0 deletions src/encoding/delta_byte_array/decoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,4 +74,35 @@ mod tests {
assert_eq!(values, expected_values);
Ok(())
}

#[test]
fn test_with_prefix() -> Result<(), Error> {
// VALIDATED from spark==3.1.1
let data = &[
128, 1, 4, 2, 0, 6, 0, 0, 0, 0, 128, 1, 4, 2, 10, 4, 0, 0, 0, 0, 72, 101, 108, 108,
111, 105, 99, 111, 112, 116, 101, 114,
// extra bytes are not from spark, but they should be ignored by the decoder
// because they are beyond the sum of all lengths.
1, 2, 3,
];
// result of encoding
let expected_lengths = vec![5, 7];
let expected_prefixes = vec![0, 3];
let expected_values = b"Helloicopter";

let mut decoder = Decoder::try_new(data)?;
let prefixes = decoder.by_ref().collect::<Result<Vec<_>, _>>()?;
assert_eq!(prefixes, expected_prefixes);

// move to the lengths
let mut decoder = decoder.into_lengths()?;

let lengths = decoder.by_ref().collect::<Result<Vec<_>, _>>()?;
assert_eq!(lengths, expected_lengths);

// move to the values
let values = decoder.values();
assert_eq!(values, expected_values);
Ok(())
}
}
36 changes: 36 additions & 0 deletions src/encoding/delta_byte_array/encoder.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
use crate::encoding::delta_length_byte_array;

use super::super::delta_bitpacked;

/// Encodes an iterator of according to DELTA_BYTE_ARRAY
pub fn encode<'a, I: Iterator<Item = &'a [u8]> + Clone>(iterator: I, buffer: &mut Vec<u8>) {
let mut previous = b"".as_ref();

let mut sum_lengths = 0;
let prefixes = iterator
.clone()
.map(|item| {
let prefix_length = item
.iter()
.zip(previous.iter())
.enumerate()
// find first difference
.find_map(|(length, (lhs, rhs))| {
println!("{lhs} {rhs}");
(lhs != rhs).then(|| length)
})
.unwrap_or(previous.len());
previous = item;

sum_lengths += item.len() - prefix_length;
prefix_length as i64
})
.collect::<Vec<_>>();
delta_bitpacked::encode(prefixes.iter().copied(), buffer);

let remaining = iterator
.zip(prefixes)
.map(|(item, prefix)| &item[prefix as usize..]);

delta_length_byte_array::encode(remaining, buffer);
}
30 changes: 30 additions & 0 deletions src/encoding/delta_byte_array/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,33 @@
mod decoder;
mod encoder;

pub use decoder::Decoder;
pub use encoder::encode;

#[cfg(test)]
mod tests {
use super::*;
use crate::error::Error;

#[test]
fn basic() -> Result<(), Error> {
let data = vec![b"Hello".as_ref(), b"Helicopter"];
let mut buffer = vec![];
encode(data.clone().into_iter(), &mut buffer);

let mut decoder = Decoder::try_new(&buffer)?;
let prefixes = decoder.by_ref().collect::<Result<Vec<_>, _>>()?;
assert_eq!(prefixes, vec![0, 3]);

// move to the lengths
let mut decoder = decoder.into_lengths()?;

let lengths = decoder.by_ref().collect::<Result<Vec<_>, _>>()?;
assert_eq!(lengths, vec![5, 7]);

// move to the values
let values = decoder.values();
assert_eq!(values, b"Helloicopter");
Ok(())
}
}

0 comments on commit 0403de1

Please sign in to comment.