Skip to content

Commit

Permalink
fix(rust, python): fix ub due to invalid dtype on splitting dfs (pola…
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored and zundertj committed Jan 7, 2023
1 parent f01558a commit 3f2fd2f
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 2 deletions.
31 changes: 31 additions & 0 deletions polars/polars-core/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2378,6 +2378,21 @@ impl DataFrame {
}
}

/// Iterator over the rows in this `DataFrame` as Arrow RecordBatches as physical values.
///
/// # Panics
///
/// Panics if the `DataFrame` that is passed is not rechunked.
///
/// This responsibility is left to the caller as we don't want to take mutable references here,
/// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
/// as well.
pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> {
PhysRecordBatchIter {
iters: self.columns.iter().map(|s| s.chunks().iter()).collect(),
}
}

/// Get a `DataFrame` with all the columns in reversed order.
#[must_use]
pub fn reverse(&self) -> Self {
Expand Down Expand Up @@ -3346,6 +3361,22 @@ impl<'a> Iterator for RecordBatchIter<'a> {
}
}

pub struct PhysRecordBatchIter<'a> {
iters: Vec<std::slice::Iter<'a, ArrayRef>>,
}

impl Iterator for PhysRecordBatchIter<'_> {
type Item = ArrowChunk;

fn next(&mut self) -> Option<Self::Item> {
self.iters
.iter_mut()
.map(|phys_iter| phys_iter.next().cloned())
.collect::<Option<Vec<_>>>()
.map(ArrowChunk::new)
}
}

impl Default for DataFrame {
fn default() -> Self {
DataFrame::new_no_checks(vec![])
Expand Down
4 changes: 3 additions & 1 deletion polars/polars-core/src/series/iterator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ impl FromIterator<String> for Series {
}
}

pub type SeriesPhysIter<'a> = Box<dyn ExactSizeIterator<Item = AnyValue<'a>> + 'a>;

#[cfg(any(feature = "rows", feature = "dtype-struct"))]
impl Series {
/// iterate over [`Series`] as [`AnyValue`].
Expand All @@ -76,7 +78,7 @@ impl Series {
}
}

pub fn phys_iter(&self) -> Box<dyn ExactSizeIterator<Item = AnyValue<'_>> + '_> {
pub fn phys_iter(&self) -> SeriesPhysIter<'_> {
let dtype = self.dtype();
let phys_dtype = dtype.to_physical();

Expand Down
2 changes: 1 addition & 1 deletion polars/polars-core/src/utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ pub fn split_series(s: &Series, n: usize) -> PolarsResult<Vec<Series>> {
}

fn flatten_df(df: &DataFrame) -> impl Iterator<Item = DataFrame> + '_ {
df.iter_chunks().flat_map(|chunk| {
df.iter_chunks_physical().flat_map(|chunk| {
let df = DataFrame::new_no_checks(
df.iter()
.zip(chunk.into_arrays())
Expand Down

0 comments on commit 3f2fd2f

Please sign in to comment.