Skip to content

Commit

Permalink
perf(rust, python): rechunk dataframe before unique computation (pola…
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored and konstin committed May 8, 2023
1 parent 61dbb70 commit 5b4fc64
Showing 1 changed file with 13 additions and 10 deletions.
23 changes: 13 additions & 10 deletions polars/polars-core/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3005,19 +3005,22 @@ impl DataFrame {
Some(s) => s.iter().map(|s| &**s).collect(),
None => self.get_column_names(),
};
let mut df = self.clone();
// `take` on a multi-chunk DataFrame is very slow, so rechunk into a single chunk first
df.as_single_chunk_par();

let columns = match (keep, maintain_order) {
(UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
let gb = self.groupby_stable(names)?;
let gb = df.groupby_stable(names)?;
let groups = gb.get_groups();
let (offset, len) = slice.unwrap_or((0, groups.len()));
let groups = groups.slice(offset, len);
self.apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
df.apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
}
(UniqueKeepStrategy::Last, true) => {
// The result must be ordered by each group's last value; the stable groupby orders
// groups by their first value, so it cannot be used to maintain order here.
let gb = self.groupby(names)?;
let gb = df.groupby(names)?;
let groups = gb.get_groups();

let func = |g: GroupsIndicator| match g {
Expand All @@ -3034,30 +3037,30 @@ impl DataFrame {
};

let last_idx = last_idx.sort(false);
return Ok(unsafe { self.take_unchecked(&last_idx) });
return Ok(unsafe { df.take_unchecked(&last_idx) });
}
(UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
let gb = self.groupby(names)?;
let gb = df.groupby(names)?;
let groups = gb.get_groups();
let (offset, len) = slice.unwrap_or((0, groups.len()));
let groups = groups.slice(offset, len);
self.apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
df.apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
}
(UniqueKeepStrategy::Last, false) => {
let gb = self.groupby(names)?;
let gb = df.groupby(names)?;
let groups = gb.get_groups();
let (offset, len) = slice.unwrap_or((0, groups.len()));
let groups = groups.slice(offset, len);
self.apply_columns_par(&|s| unsafe { s.agg_last(&groups) })
df.apply_columns_par(&|s| unsafe { s.agg_last(&groups) })
}
(UniqueKeepStrategy::None, _) => {
let df_part = self.select(names)?;
let df_part = df.select(names)?;
let mask = df_part.is_unique()?;
let mask = match slice {
None => mask,
Some((offset, len)) => mask.slice(offset, len),
};
return self.filter(&mask);
return df.filter(&mask);
}
};
Ok(DataFrame::new_no_checks(columns))
Expand Down

0 comments on commit 5b4fc64

Please sign in to comment.