Skip to content

Commit

Permalink
perf: Batch nested embed parquet decoding
Browse files Browse the repository at this point in the history
This continues the work from #17542 and now also batches the actual nesting, not only the top-level decoder.

I ran the same benchmark with `POLARS_MAX_THREADS=1` and we again see a small
speedup. However, I suspect that the speedup is larger for `FixedSizeList`s and
`Struct`s.

```
Benchmark 1: After Optimization
  Time (mean ± σ):     13.306 s ±  0.018 s    [User: 12.661 s, System: 0.614 s]
  Range (min … max):   13.263 s … 13.335 s    10 runs

Benchmark 2: Before Optimization
  Time (mean ± σ):     14.973 s ±  0.065 s    [User: 14.422 s, System: 0.517 s]
  Range (min … max):   14.887 s … 15.129 s    10 runs

Summary
  After Optimization ran
    1.13 ± 0.01 times faster than Before Optimization
```
  • Loading branch information
coastalwhite committed Jul 10, 2024
1 parent a55d9bc commit 0b157c5
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 130 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ impl<'a> Iterator for BinaryIter<'a> {

#[inline]
fn next(&mut self) -> Option<Self::Item> {
if self.values.is_empty() || self.num_values == 0 {
if self.num_values == 0 {
assert!(self.values.is_empty());
return None;
}

Expand Down
6 changes: 3 additions & 3 deletions crates/polars-parquet/src/arrow/read/deserialize/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ pub fn create_list(
nested: &mut NestedState,
values: Box<dyn Array>,
) -> Box<dyn Array> {
let (mut offsets, validity) = nested.nested.pop().unwrap().take();
let (mut offsets, validity) = nested.pop().unwrap();
match data_type.to_logical_type() {
ArrowDataType::List(_) => {
offsets.push(values.len() as i64);
Expand Down Expand Up @@ -89,7 +89,7 @@ pub fn create_map(
nested: &mut NestedState,
values: Box<dyn Array>,
) -> Box<dyn Array> {
let (mut offsets, validity) = nested.nested.pop().unwrap().take();
let (mut offsets, validity) = nested.pop().unwrap();
match data_type.to_logical_type() {
ArrowDataType::Map(_, _) => {
offsets.push(values.len() as i64);
Expand Down Expand Up @@ -147,7 +147,7 @@ where
chunk_size,
num_rows,
)?
.map(|x| Ok((NestedState::new(vec![]), x?))),
.map(|x| Ok((NestedState::default(), x?))),
));
}

Expand Down
10 changes: 5 additions & 5 deletions crates/polars-parquet/src/arrow/read/deserialize/nested.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ where
{
Box::new(iter.map(|x| {
x.map(|(mut nested, array)| {
let _ = nested.nested.pop().unwrap(); // the primitive
let _ = nested.pop().unwrap(); // the primitive
(nested, array)
})
}))
Expand All @@ -28,7 +28,7 @@ where
{
Box::new(iter.map(|x| {
x.map(|(mut nested, array)| {
let _ = nested.nested.pop().unwrap(); // the primitive
let _ = nested.pop().unwrap(); // the primitive
(nested, Box::new(array) as _)
})
}))
Expand Down Expand Up @@ -322,7 +322,7 @@ where
validity,
)?);

let _ = nested.nested.pop().unwrap(); // the primitive
let _ = nested.pop().unwrap(); // the primitive

Ok((nested, array))
});
Expand Down Expand Up @@ -380,7 +380,7 @@ where
validity,
)?);

let _ = nested.nested.pop().unwrap(); // the primitive
let _ = nested.pop().unwrap(); // the primitive

Ok((nested, array))
});
Expand Down Expand Up @@ -411,7 +411,7 @@ where
validity,
)?);

let _ = nested.nested.pop().unwrap(); // the primitive
let _ = nested.pop().unwrap(); // the primitive

Ok((nested, array))
});
Expand Down
Loading

0 comments on commit 0b157c5

Please sign in to comment.