Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: Halve the size of Booleans in row encoding #19927

Merged
merged 1 commit into from
Nov 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 21 additions & 11 deletions crates/polars-arrow/src/bitmap/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -428,18 +428,15 @@ impl<P: AsRef<[bool]>> From<P> for MutableBitmap {
}
}

impl FromIterator<bool> for MutableBitmap {
fn from_iter<I>(iter: I) -> Self
where
I: IntoIterator<Item = bool>,
{
impl Extend<bool> for MutableBitmap {
fn extend<T: IntoIterator<Item = bool>>(&mut self, iter: T) {
let mut iterator = iter.into_iter();
let mut buffer = {
let byte_capacity: usize = iterator.size_hint().0.saturating_add(7) / 8;
Vec::with_capacity(byte_capacity)
};

let mut length = 0;
let mut buffer = std::mem::take(&mut self.buffer);
let mut length = std::mem::take(&mut self.length);

let byte_capacity: usize = iterator.size_hint().0.saturating_add(7) / 8;
buffer.reserve(byte_capacity);

loop {
let mut exhausted = false;
Expand Down Expand Up @@ -481,7 +478,20 @@ impl FromIterator<bool> for MutableBitmap {
break;
}
}
Self { buffer, length }

self.buffer = buffer;
self.length = length;
}
}

impl FromIterator<bool> for MutableBitmap {
    /// Collects an iterator of booleans into a fresh bitmap.
    ///
    /// Starts from an empty bitmap created with `Self::new()` and appends
    /// every item through `extend`, so collecting and extending share one
    /// code path.
    fn from_iter<I>(iter: I) -> Self
    where
        I: IntoIterator<Item = bool>,
    {
        let mut bitmap = Self::new();
        bitmap.extend(iter);
        bitmap
    }
}

Expand Down
4 changes: 2 additions & 2 deletions crates/polars-row/src/encode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,7 @@ unsafe fn encode_flat_array(
},
D::Boolean => {
let array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
crate::fixed::encode_iter(buffer, array.iter(), field, offsets);
crate::fixed::encode_bool_iter(buffer, array.iter(), field, offsets);
},
dt if dt.is_numeric() => with_match_arrow_primitive_type!(dt, |$T| {
let array = array.as_any().downcast_ref::<PrimitiveArray<$T>>().unwrap();
Expand Down Expand Up @@ -815,7 +815,7 @@ pub fn fixed_size(dtype: &ArrowDataType) -> Option<usize> {
Decimal(_, _) => i128::ENCODED_LEN,
Float32 => f32::ENCODED_LEN,
Float64 => f64::ENCODED_LEN,
Boolean => bool::ENCODED_LEN,
Boolean => 1,
FixedSizeList(f, width) => 1 + width * fixed_size(f.dtype())?,
Struct(fs) => {
let mut sum = 0;
Expand Down
101 changes: 60 additions & 41 deletions crates/polars-row/src/fixed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::fmt::Debug;
use std::mem::MaybeUninit;

use arrow::array::{BooleanArray, PrimitiveArray};
use arrow::bitmap::Bitmap;
use arrow::bitmap::{Bitmap, MutableBitmap};
use arrow::datatypes::ArrowDataType;
use arrow::types::NativeType;
use polars_utils::slice::*;
Expand Down Expand Up @@ -41,17 +41,6 @@ pub trait FixedLengthEncoding: Copy + Debug {
}
}

impl FixedLengthEncoding for bool {
    /// A boolean is stored in exactly one byte.
    type Encoded = [u8; 1];

    /// Encodes `false` as `[0]` and `true` as `[1]`.
    fn encode(self) -> Self::Encoded {
        [u8::from(self)]
    }

    /// Decodes any non-zero byte as `true` and zero as `false`.
    fn decode(encoded: Self::Encoded) -> Self {
        let [byte] = encoded;
        byte != 0
    }
}

// encode as big endian
macro_rules! encode_unsigned {
($n:expr, $t:ty) => {
Expand Down Expand Up @@ -216,6 +205,28 @@ pub(crate) unsafe fn encode_iter<I: Iterator<Item = Option<T>>, T: FixedLengthEn
}
}

/// Writes one encoded byte per optional boolean into `buffer`.
///
/// `offsets` and `input` are walked in lock-step (stopping at the shorter of
/// the two). For each pair the byte at `buffer[*offset]` is set to the field's
/// null sentinel for `None`, its "false" sentinel for `Some(false)` or its
/// "true" sentinel for `Some(true)`, and the offset is then advanced by one.
///
/// # Safety
/// Every visited offset must be a valid index into `buffer`; the write uses
/// `get_unchecked_mut` and performs no bounds check.
pub(crate) unsafe fn encode_bool_iter<I: Iterator<Item = Option<bool>>>(
    buffer: &mut [MaybeUninit<u8>],
    input: I,
    field: &EncodingField,
    offsets: &mut [usize],
) {
    // Resolve the three possible output bytes once, outside the loop.
    let null_byte = get_null_sentinel(field);
    let true_byte = field.bool_true_sentinel();
    let false_byte = field.bool_false_sentinel();

    for (cursor, value) in offsets.iter_mut().zip(input) {
        let encoded = if let Some(flag) = value {
            if flag {
                true_byte
            } else {
                false_byte
            }
        } else {
            null_byte
        };

        *buffer.get_unchecked_mut(*cursor) = MaybeUninit::new(encoded);
        // A boolean occupies a single byte in the row.
        *cursor += 1;
    }
}

pub(super) unsafe fn decode_primitive<T: NativeType + FixedLengthEncoding>(
rows: &mut [&[u8]],
field: &EncodingField,
Expand Down Expand Up @@ -262,43 +273,51 @@ where
/// Decodes one boolean column from the leading byte of every row.
///
/// The first byte of each row is compared against the field's null sentinel
/// and its "true" sentinel; a byte equal to the "true" sentinel decodes as
/// `true`, anything else as `false`. Each row slice is advanced past the byte
/// that was consumed.
///
/// NOTE(review): as shown here the body interleaves two implementations (it
/// looks like removed and added lines of a diff merged together) — see the
/// inline notes and confirm against the real file before relying on it.
///
/// # Safety
/// Every row must contain at least one byte: rows are read with
/// `get_unchecked(0)` and re-sliced with `get_unchecked(1..)`.
pub(super) unsafe fn decode_bool(rows: &mut [&[u8]], field: &EncodingField) -> BooleanArray {
let mut has_nulls = false;
let null_sentinel = get_null_sentinel(field);
let true_sentinel = field.bool_true_sentinel();

// Single pass over the rows: build the value bits and record whether any
// row starts with the null sentinel.
let values = Bitmap::from_trusted_len_iter_unchecked(rows.iter().map(|row| {
let b = *row.get_unchecked(0);
has_nulls |= b == null_sentinel;
b == true_sentinel
}));

// No nulls seen: drop the consumed byte from each row and return without a
// validity mask.
if !has_nulls {
rows.iter_mut()
.for_each(|row| *row = row.get_unchecked(1..));
return BooleanArray::new(ArrowDataType::Boolean, values, None);
}

// NOTE(review): from here through the `BooleanArray::new(.., values, validity)`
// expression below, the code appears to be the previous implementation: it
// re-binds `values`, and relies on `bool::ENCODED_LEN` / `bool::decode_reverse`.
// Presumably these are the lines the change removes — confirm.
let values = rows
.iter()
.map(|row| {
has_nulls |= *row.get_unchecked(0) == null_sentinel;
// skip null sentinel
let start = 1;
let end = start + bool::ENCODED_LEN - 1;
let slice = row.get_unchecked(start..end);
let bytes = <bool as FixedLengthEncoding>::Encoded::from_slice(slice);

if field.descending {
bool::decode_reverse(bytes)
} else {
bool::decode(bytes)
}
})
.collect::<Bitmap>();

let validity = if has_nulls {
Some(decode_nulls(rows, null_sentinel))
} else {
None
};

// validity byte and data length
let increment_len = bool::ENCODED_LEN;

increment_row_counter(rows, increment_len);
BooleanArray::new(ArrowDataType::Boolean, values, validity)
// Nulls present: a second pass builds the validity mask (bit set when the
// leading byte is not the null sentinel) and advances each row by one byte.
let validity = Bitmap::from_trusted_len_iter_unchecked(rows.iter_mut().map(|row| {
let v = *row.get_unchecked(0) != null_sentinel;
*row = row.get_unchecked(1..);
v
}));
BooleanArray::new(ArrowDataType::Boolean, values, Some(validity))
}
/// Advances every row slice by `fixed_size` bytes, dropping that many bytes
/// from the front of each row.
///
/// # Safety
/// Each row must be at least `fixed_size` bytes long; the re-slicing uses
/// `get_unchecked` and performs no bounds check.
unsafe fn increment_row_counter(rows: &mut [&[u8]], fixed_size: usize) {
    rows.iter_mut()
        .for_each(|row| *row = row.get_unchecked(fixed_size..));
}

/// Builds a validity bitmap for `rows`, or returns `None` when no row is null.
///
/// A row counts as null when its first byte equals `null_sentinel`. If no such
/// row exists the function returns `None` without allocating. Otherwise the
/// returned bitmap has one bit per row: set for rows before the first null,
/// cleared for the first null, and computed from the leading byte for every
/// row after it. The rows themselves are not advanced.
///
/// # Safety
/// Every row must contain at least one byte; the leading byte is read with
/// `get_unchecked(0)`.
pub(super) unsafe fn decode_opt_nulls(rows: &[&[u8]], null_sentinel: u8) -> Option<Bitmap> {
    // Bail out early (no allocation) when there is no null at all.
    let null_idx = rows
        .iter()
        .position(|row| *row.get_unchecked(0) == null_sentinel)?;
    let remaining = &rows[null_idx + 1..];

    let mut validity = MutableBitmap::with_capacity(rows.len());
    // Everything before the first null is known to be valid.
    validity.extend_constant(null_idx, true);
    // The first null itself.
    validity.push(false);
    // Rows after the first null still need to be inspected one by one.
    validity.extend_from_trusted_len_iter_unchecked(
        remaining
            .iter()
            .map(|row| *row.get_unchecked(0) != null_sentinel),
    );

    Some(validity.freeze())
}

pub(super) unsafe fn decode_nulls(rows: &[&[u8]], null_sentinel: u8) -> Bitmap {
rows.iter()
.map(|row| *row.get_unchecked(0) != null_sentinel)
Expand Down
19 changes: 19 additions & 0 deletions crates/polars-row/src/row.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ use arrow::datatypes::ArrowDataType;
use arrow::ffi::mmap;
use arrow::offset::{Offsets, OffsetsBuffer};

// Bytes used to represent non-null booleans in the row encoding.
// `true` (0x03) is numerically greater than `false` (0x02), so comparing the
// raw bytes orders `false` before `true`.
/// Byte representing a non-null `true` value (before any descending inversion).
const BOOLEAN_TRUE_SENTINEL: u8 = 0x03;
/// Byte representing a non-null `false` value (before any descending inversion).
const BOOLEAN_FALSE_SENTINEL: u8 = 0x02;

#[derive(Clone, Default, Copy)]
pub struct EncodingField {
/// Whether to sort in descending order
Expand All @@ -30,6 +33,22 @@ impl EncodingField {
..Default::default()
}
}

/// Returns the byte that encodes `true` for this field.
///
/// For an ascending field this is `BOOLEAN_TRUE_SENTINEL`; for a descending
/// field every bit of that byte is inverted.
pub(crate) fn bool_true_sentinel(self) -> u8 {
    // XOR with 0xFF flips all bits, which is exactly what `!` does on a `u8`.
    let flip_mask = if self.descending { u8::MAX } else { 0 };
    BOOLEAN_TRUE_SENTINEL ^ flip_mask
}

/// Returns the byte that encodes `false` for this field.
///
/// For an ascending field this is `BOOLEAN_FALSE_SENTINEL`; for a descending
/// field every bit of that byte is inverted.
pub(crate) fn bool_false_sentinel(self) -> u8 {
    // XOR with 0xFF flips all bits, which is exactly what `!` does on a `u8`.
    let flip_mask = if self.descending { u8::MAX } else { 0 };
    BOOLEAN_FALSE_SENTINEL ^ flip_mask
}
}

#[derive(Default, Clone)]
Expand Down
19 changes: 3 additions & 16 deletions crates/polars-row/src/variable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ use arrow::datatypes::ArrowDataType;
use arrow::offset::Offsets;
use polars_utils::slice::Slice2Uninit;

use crate::fixed::{decode_nulls, get_null_sentinel};
use crate::fixed::{decode_opt_nulls, get_null_sentinel};
use crate::EncodingField;

/// The block size of the variable length encoding
Expand Down Expand Up @@ -183,11 +183,6 @@ pub(crate) unsafe fn encode_iter<'a, I: Iterator<Item = Option<&'a [u8]>>>(
}
}

/// Reports whether any row starts with `null_sentinel`.
///
/// Stops at the first matching row; returns `false` for an empty `rows`.
///
/// # Safety
/// Every row must contain at least one byte; the leading byte is read with
/// `get_unchecked(0)`.
unsafe fn has_nulls(rows: &[&[u8]], null_sentinel: u8) -> bool {
    for row in rows {
        if *row.get_unchecked(0) == null_sentinel {
            return true;
        }
    }
    false
}

pub(crate) unsafe fn encoded_item_len(
row: &[u8],
non_empty_sentinel: u8,
Expand Down Expand Up @@ -305,11 +300,7 @@ pub(super) unsafe fn decode_binary(rows: &mut [&[u8]], field: &EncodingField) ->
};

let null_sentinel = get_null_sentinel(field);
let validity = if has_nulls(rows, null_sentinel) {
Some(decode_nulls(rows, null_sentinel))
} else {
None
};
let validity = decode_opt_nulls(rows, null_sentinel);
let values_cap = rows
.iter()
.map(|row| {
Expand Down Expand Up @@ -380,11 +371,7 @@ pub(super) unsafe fn decode_binview(rows: &mut [&[u8]], field: &EncodingField) -
};

let null_sentinel = get_null_sentinel(field);
let validity = if has_nulls(rows, null_sentinel) {
Some(decode_nulls(rows, null_sentinel))
} else {
None
};
let validity = decode_opt_nulls(rows, null_sentinel);

let mut mutable = MutableBinaryViewArray::with_capacity(rows.len());

Expand Down
28 changes: 28 additions & 0 deletions py-polars/tests/unit/test_row_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,34 @@ def test_parametric_binary_order(df: pl.DataFrame) -> None:
parametric_order_base(df)


def test_order_bool() -> None:
    """Check the ordering of row-encoded Boolean values.

    Each case is (left values, right values, expected comparisons, extra
    keyword options) and is forwarded to ``assert_order_series`` in order.
    """
    cases = [
        # Default options: None compares below True and False.
        ([None, False, True], [True, False, None], ["lt", "eq", "gt"], {}),
        # nulls_last=True: None compares above the non-null values.
        (
            [None, False, True],
            [True, False, None],
            ["gt", "eq", "lt"],
            {"nulls_last": True},
        ),
        # Non-null values only: False compares below True.
        (
            [False, False, True, True],
            [True, False, True, False],
            ["lt", "eq", "eq", "gt"],
            {},
        ),
        # Same inputs and expectations, with descending=True.
        (
            [False, False, True, True],
            [True, False, True, False],
            ["lt", "eq", "eq", "gt"],
            {"descending": True},
        ),
    ]

    for left, right, expected, options in cases:
        assert_order_series(left, right, pl.Boolean, expected, **options)


def test_order_int() -> None:
dtype = pl.Int32
assert_order_series([1, 2, 3], [3, 2, 1], dtype, ["lt", "eq", "gt"])
Expand Down