Skip to content

Commit

Permalink
perf: add utf8-validation fast paths for utf8view (#14644)
Browse files: browse the repository at this point in the history
  • Loading branch information
ritchie46 authored Feb 23, 2024
1 parent 9117a9e commit 9560c34
Showing 1 changed file with 38 additions and 13 deletions.
51 changes: 38 additions & 13 deletions crates/polars-arrow/src/array/binview/view.rs
Original file line number | Diff line number | Diff line change
Expand Up @@ -195,23 +195,48 @@ pub(super) unsafe fn validate_utf8_only(
views: &[View],
buffers: &[Buffer<u8>],
) -> PolarsResult<()> {
for view in views {
let len = view.length;
if len <= 12 {
// If there are no buffers, every view is validated from its inline bytes,
// so the per-view branch on the length is not needed.
if buffers.is_empty() {
for view in views {
let len = view.length;
validate_utf8(
view.to_le_bytes()
.get_unchecked_release(4..4 + len as usize),
)?;
} else {
let buffer_idx = view.buffer_idx;
let offset = view.offset;
let data = buffers.get_unchecked_release(buffer_idx as usize);

let start = offset as usize;
let end = start + len as usize;
let b = &data.as_slice().get_unchecked_release(start..end);
validate_utf8(b)?;
};
}
return Ok(());
}

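// Fast path: if every buffer is pure ASCII, any slice taken from a buffer is
// already valid UTF-8, so only the inlined views (length <= 12) need checking.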
if buffers.iter().all(|buf| buf.is_ascii()) {
for view in views {
let len = view.length;
if len <= 12 {
validate_utf8(
view.to_le_bytes()
.get_unchecked_release(4..4 + len as usize),
)?;
}
}
} else {
for view in views {
let len = view.length;
if len <= 12 {
validate_utf8(
view.to_le_bytes()
.get_unchecked_release(4..4 + len as usize),
)?;
} else {
let buffer_idx = view.buffer_idx;
let offset = view.offset;
let data = buffers.get_unchecked_release(buffer_idx as usize);

let start = offset as usize;
let end = start + len as usize;
let b = &data.as_slice().get_unchecked_release(start..end);
validate_utf8(b)?;
};
}
}

Ok(())
Expand Down

0 comments on commit 9560c34

Please sign in to comment.