Skip to content

Commit

Permalink
Optimize array::transform::utils::set_bits (#716)
Browse files Browse the repository at this point in the history
* Added tests

* Updated tests and improved implementation

* Cleanup

* Stopped collecting bytes before writing to write_data

* Added tests

* Cleanup and comments

* Fixed clippy warning

* Fixed an endianess issue

* Fixed comments and naming

* Made tests less prone to off-by-n errors
  • Loading branch information
mathiaspeters-sig authored Sep 9, 2021
1 parent 72d7c5b commit 4221099
Showing 1 changed file with 166 additions and 9 deletions.
175 changes: 166 additions & 9 deletions arrow/src/array/transform/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,14 @@
// specific language governing permissions and limitations
// under the License.

use crate::{array::OffsetSizeTrait, buffer::MutableBuffer, util::bit_util};
use crate::{
array::OffsetSizeTrait,
buffer::MutableBuffer,
util::{
bit_chunk_iterator::BitChunks,
bit_util::{self, ceil},
},
};

/// extends the `buffer` to be able to hold `len` bits, setting all bits of the new size to zero.
#[inline]
Expand All @@ -28,22 +35,45 @@ pub(super) fn resize_for_bits(buffer: &mut MutableBuffer, len: usize) {

/// sets all bits on `write_data` on the range `[offset_write..offset_write+len]` to be equal to the
/// bits on `data` on the range `[offset_read..offset_read+len]`
/// returns the number of `0` bits `data[offset_read..offset_read+len]`
pub(super) fn set_bits(
write_data: &mut [u8],
data: &[u8],
offset_write: usize,
offset_read: usize,
len: usize,
) -> usize {
let mut count = 0;
(0..len).for_each(|i| {
if bit_util::get_bit(data, offset_read + i) {
bit_util::set_bit(write_data, offset_write + i);
} else {
count += 1;
}
let mut null_count = 0;

let mut bits_to_align = offset_write % 8;
if bits_to_align > 0 {
bits_to_align = std::cmp::min(len, 8 - bits_to_align);
}
let mut write_byte_index = ceil(offset_write + bits_to_align, 8);

// Set full bytes provided by bit chunk iterator (which iterates in 64 bits at a time)
let chunks = BitChunks::new(data, offset_read + bits_to_align, len - bits_to_align);
chunks.iter().for_each(|chunk| {
null_count += chunk.count_zeros();
chunk.to_le_bytes().iter().for_each(|b| {
write_data[write_byte_index] = *b;
write_byte_index += 1;
})
});
count

// Set individual bits both to align write_data to a byte offset and the remainder bits not covered by the bit chunk iterator
let remainder_offset = len - chunks.remainder_len();
(0..bits_to_align)
.chain(remainder_offset..len)
.for_each(|i| {
if bit_util::get_bit(data, offset_read + i) {
bit_util::set_bit(write_data, offset_write + i);
} else {
null_count += 1;
}
});

null_count as usize
}

pub(super) fn extend_offsets<T: OffsetSizeTrait>(
Expand Down Expand Up @@ -74,3 +104,130 @@ pub(super) unsafe fn get_last_offset<T: OffsetSizeTrait>(
debug_assert!(prefix.is_empty() && suffix.is_empty());
*offsets.get_unchecked(offsets.len() - 1)
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_set_bits_aligned() {
let mut destination: Vec<u8> = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
let source: &[u8] = &[
0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011,
0b11100111, 0b10100101,
];

let destination_offset = 8;
let source_offset = 0;

let len = 64;

let expected_data: &[u8] = &[
0, 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011,
0b11100111, 0b10100101, 0,
];
let expected_null_count = 24;
let result = set_bits(
destination.as_mut_slice(),
source,
destination_offset,
source_offset,
len,
);

assert_eq!(destination, expected_data);
assert_eq!(result, expected_null_count);
}

#[test]
fn test_set_bits_unaligned_destination_start() {
let mut destination: Vec<u8> = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
let source: &[u8] = &[
0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011,
0b11100111, 0b10100101,
];

let destination_offset = 3;
let source_offset = 0;

let len = 64;

let expected_data: &[u8] = &[
0b00111000, 0b00101111, 0b11001101, 0b11011100, 0b01011110, 0b00011111,
0b00111110, 0b00101111, 0b00000101, 0b00000000,
];
let expected_null_count = 24;
let result = set_bits(
destination.as_mut_slice(),
source,
destination_offset,
source_offset,
len,
);

assert_eq!(destination, expected_data);
assert_eq!(result, expected_null_count);
}

#[test]
fn test_set_bits_unaligned_destination_end() {
let mut destination: Vec<u8> = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
let source: &[u8] = &[
0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011,
0b11100111, 0b10100101,
];

let destination_offset = 8;
let source_offset = 0;

let len = 62;

let expected_data: &[u8] = &[
0, 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011,
0b11100111, 0b00100101, 0,
];
let expected_null_count = 23;
let result = set_bits(
destination.as_mut_slice(),
source,
destination_offset,
source_offset,
len,
);

assert_eq!(destination, expected_data);
assert_eq!(result, expected_null_count);
}

#[test]
fn test_set_bits_unaligned() {
let mut destination: Vec<u8> = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
let source: &[u8] = &[
0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011,
0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011,
0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011,
];

let destination_offset = 3;
let source_offset = 5;

let len = 95;

let expected_data: &[u8] = &[
0b01111000, 0b01101001, 0b11100110, 0b11110110, 0b11111010, 0b11110000,
0b01111001, 0b01101001, 0b11100110, 0b11110110, 0b11111010, 0b11110000,
0b00000001,
];
let expected_null_count = 35;
let result = set_bits(
destination.as_mut_slice(),
source,
destination_offset,
source_offset,
len,
);

assert_eq!(destination, expected_data);
assert_eq!(result, expected_null_count);
}
}

0 comments on commit 4221099

Please sign in to comment.