diff --git a/src/encoding/delta_byte_array/decoder.rs b/src/encoding/delta_byte_array/decoder.rs index 556797e2..6e8f2085 100644 --- a/src/encoding/delta_byte_array/decoder.rs +++ b/src/encoding/delta_byte_array/decoder.rs @@ -74,4 +74,35 @@ mod tests { assert_eq!(values, expected_values); Ok(()) } + + #[test] + fn test_with_prefix() -> Result<(), Error> { + // VALIDATED from spark==3.1.1 + let data = &[ + 128, 1, 4, 2, 0, 6, 0, 0, 0, 0, 128, 1, 4, 2, 10, 4, 0, 0, 0, 0, 72, 101, 108, 108, + 111, 105, 99, 111, 112, 116, 101, 114, + // extra bytes are not from spark, but they should be ignored by the decoder + // because they are beyond the sum of all lengths. + 1, 2, 3, + ]; + // result of encoding + let expected_lengths = vec![5, 7]; + let expected_prefixes = vec![0, 3]; + let expected_values = b"Helloicopter"; + + let mut decoder = Decoder::try_new(data)?; + let prefixes = decoder.by_ref().collect::, _>>()?; + assert_eq!(prefixes, expected_prefixes); + + // move to the lengths + let mut decoder = decoder.into_lengths()?; + + let lengths = decoder.by_ref().collect::, _>>()?; + assert_eq!(lengths, expected_lengths); + + // move to the values + let values = decoder.values(); + assert_eq!(values, expected_values); + Ok(()) + } } diff --git a/src/encoding/delta_byte_array/encoder.rs b/src/encoding/delta_byte_array/encoder.rs new file mode 100644 index 00000000..e0a55d84 --- /dev/null +++ b/src/encoding/delta_byte_array/encoder.rs @@ -0,0 +1,36 @@ +use crate::encoding::delta_length_byte_array; + +use super::super::delta_bitpacked; + +/// Encodes an iterator of according to DELTA_BYTE_ARRAY +pub fn encode<'a, I: Iterator + Clone>(iterator: I, buffer: &mut Vec) { + let mut previous = b"".as_ref(); + + let mut sum_lengths = 0; + let prefixes = iterator + .clone() + .map(|item| { + let prefix_length = item + .iter() + .zip(previous.iter()) + .enumerate() + // find first difference + .find_map(|(length, (lhs, rhs))| { + println!("{lhs} {rhs}"); + (lhs != rhs).then(|| length) + }) + .unwrap_or(previous.len()); + previous = item; + + sum_lengths += item.len() - prefix_length; + prefix_length as i64 + }) + .collect::>(); + delta_bitpacked::encode(prefixes.iter().copied(), buffer); + + let remaining = iterator + .zip(prefixes) + .map(|(item, prefix)| &item[prefix as usize..]); + + delta_length_byte_array::encode(remaining, buffer); +} diff --git a/src/encoding/delta_byte_array/mod.rs b/src/encoding/delta_byte_array/mod.rs index 32f89d6d..12d28ed7 100644 --- a/src/encoding/delta_byte_array/mod.rs +++ b/src/encoding/delta_byte_array/mod.rs @@ -1,3 +1,33 @@ mod decoder; +mod encoder; pub use decoder::Decoder; +pub use encoder::encode; + +#[cfg(test)] +mod tests { + use super::*; + use crate::error::Error; + + #[test] + fn basic() -> Result<(), Error> { + let data = vec![b"Hello".as_ref(), b"Helicopter"]; + let mut buffer = vec![]; + encode(data.clone().into_iter(), &mut buffer); + + let mut decoder = Decoder::try_new(&buffer)?; + let prefixes = decoder.by_ref().collect::, _>>()?; + assert_eq!(prefixes, vec![0, 3]); + + // move to the lengths + let mut decoder = decoder.into_lengths()?; + + let lengths = decoder.by_ref().collect::, _>>()?; + assert_eq!(lengths, vec![5, 7]); + + // move to the values + let values = decoder.values(); + assert_eq!(values, b"Helloicopter"); + Ok(()) + } +}