diff --git a/arrow/src/array/transform/utils.rs b/arrow/src/array/transform/utils.rs index 8c718c70c176..ed7f455a7b2b 100644 --- a/arrow/src/array/transform/utils.rs +++ b/arrow/src/array/transform/utils.rs @@ -15,7 +15,14 @@ // specific language governing permissions and limitations // under the License. -use crate::{array::OffsetSizeTrait, buffer::MutableBuffer, util::bit_util}; +use crate::{ + array::OffsetSizeTrait, + buffer::MutableBuffer, + util::{ + bit_chunk_iterator::BitChunks, + bit_util::{self, ceil}, + }, +}; /// extends the `buffer` to be able to hold `len` bits, setting all bits of the new size to zero. #[inline] @@ -28,6 +35,7 @@ pub(super) fn resize_for_bits(buffer: &mut MutableBuffer, len: usize) { /// sets all bits on `write_data` on the range `[offset_write..offset_write+len]` to be equal to the /// bits on `data` on the range `[offset_read..offset_read+len]` +/// returns the number of `0` bits `data[offset_read..offset_read+len]` pub(super) fn set_bits( write_data: &mut [u8], data: &[u8], @@ -35,15 +43,37 @@ pub(super) fn set_bits( offset_read: usize, len: usize, ) -> usize { - let mut count = 0; - (0..len).for_each(|i| { - if bit_util::get_bit(data, offset_read + i) { - bit_util::set_bit(write_data, offset_write + i); - } else { - count += 1; - } + let mut null_count = 0; + + let mut bits_to_align = offset_write % 8; + if bits_to_align > 0 { + bits_to_align = std::cmp::min(len, 8 - bits_to_align); + } + let mut write_byte_index = ceil(offset_write + bits_to_align, 8); + + // Set full bytes provided by bit chunk iterator (which iterates in 64 bits at a time) + let chunks = BitChunks::new(data, offset_read + bits_to_align, len - bits_to_align); + chunks.iter().for_each(|chunk| { + null_count += chunk.count_zeros(); + chunk.to_le_bytes().iter().for_each(|b| { + write_data[write_byte_index] = *b; + write_byte_index += 1; + }) }); - count + + // Set individual bits both to align write_data to a byte offset and the remainder bits not covered by the bit chunk iterator + let remainder_offset = len - chunks.remainder_len(); + (0..bits_to_align) + .chain(remainder_offset..len) + .for_each(|i| { + if bit_util::get_bit(data, offset_read + i) { + bit_util::set_bit(write_data, offset_write + i); + } else { + null_count += 1; + } + }); + + null_count as usize } pub(super) fn extend_offsets( @@ -74,3 +104,130 @@ pub(super) unsafe fn get_last_offset( debug_assert!(prefix.is_empty() && suffix.is_empty()); *offsets.get_unchecked(offsets.len() - 1) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_set_bits_aligned() { + let mut destination: Vec = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + let source: &[u8] = &[ + 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, + 0b11100111, 0b10100101, + ]; + + let destination_offset = 8; + let source_offset = 0; + + let len = 64; + + let expected_data: &[u8] = &[ + 0, 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, + 0b11100111, 0b10100101, 0, + ]; + let expected_null_count = 24; + let result = set_bits( + destination.as_mut_slice(), + source, + destination_offset, + source_offset, + len, + ); + + assert_eq!(destination, expected_data); + assert_eq!(result, expected_null_count); + } + + #[test] + fn test_set_bits_unaligned_destination_start() { + let mut destination: Vec = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + let source: &[u8] = &[ + 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, + 0b11100111, 0b10100101, + ]; + + let destination_offset = 3; + let source_offset = 0; + + let len = 64; + + let expected_data: &[u8] = &[ + 0b00111000, 0b00101111, 0b11001101, 0b11011100, 0b01011110, 0b00011111, + 0b00111110, 0b00101111, 0b00000101, 0b00000000, + ]; + let expected_null_count = 24; + let result = set_bits( + destination.as_mut_slice(), + source, + destination_offset, + source_offset, + len, + ); + + assert_eq!(destination, expected_data); + assert_eq!(result, expected_null_count); + } + + #[test] + fn test_set_bits_unaligned_destination_end() { + let mut destination: Vec = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + let source: &[u8] = &[ + 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, + 0b11100111, 0b10100101, + ]; + + let destination_offset = 8; + let source_offset = 0; + + let len = 62; + + let expected_data: &[u8] = &[ + 0, 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, + 0b11100111, 0b00100101, 0, + ]; + let expected_null_count = 23; + let result = set_bits( + destination.as_mut_slice(), + source, + destination_offset, + source_offset, + len, + ); + + assert_eq!(destination, expected_data); + assert_eq!(result, expected_null_count); + } + + #[test] + fn test_set_bits_unaligned() { + let mut destination: Vec = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + let source: &[u8] = &[ + 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, + 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, + 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, + ]; + + let destination_offset = 3; + let source_offset = 5; + + let len = 95; + + let expected_data: &[u8] = &[ + 0b01111000, 0b01101001, 0b11100110, 0b11110110, 0b11111010, 0b11110000, + 0b01111001, 0b01101001, 0b11100110, 0b11110110, 0b11111010, 0b11110000, + 0b00000001, + ]; + let expected_null_count = 35; + let result = set_bits( + destination.as_mut_slice(), + source, + destination_offset, + source_offset, + len, + ); + + assert_eq!(destination, expected_data); + assert_eq!(result, expected_null_count); + } +}