Skip to content

Commit

Permalink
feat(rust, python): is_first for struct dtype (pola-rs#6595)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored and vincent committed Feb 9, 2023
1 parent 950fde6 commit fb5503b
Show file tree
Hide file tree
Showing 16 changed files with 129 additions and 159 deletions.
2 changes: 1 addition & 1 deletion polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ zip_with = ["polars-core/zip_with"]
round_series = ["polars-core/round_series", "polars-lazy/round_series", "polars-ops/round_series"]
checked_arithmetic = ["polars-core/checked_arithmetic"]
repeat_by = ["polars-core/repeat_by", "polars-lazy/repeat_by"]
is_first = ["polars-core/is_first", "polars-lazy/is_first"]
is_first = ["polars-lazy/is_first", "polars-ops/is_first"]
is_last = ["polars-core/is_last"]
asof_join = ["polars-core/asof_join", "polars-lazy/asof_join", "polars-ops/asof_join"]
cross_join = ["polars-core/cross_join", "polars-lazy/cross_join", "polars-ops/cross_join"]
Expand Down
111 changes: 0 additions & 111 deletions polars/polars-core/src/chunked_array/ops/unique/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -424,96 +424,6 @@ impl ChunkUnique<Float64Type> for Float64Chunked {
}
}

// Kernels that compute a "first occurrence" mask: one boolean per input slot,
// `true` where the slot's value had not been inserted into the hash set before.
// The whole module is only compiled when the `is_first` feature is enabled.
#[cfg(feature = "is_first")]
mod is_first {
use arrow::array::BooleanArray;

use super::*;
use crate::utils::CustomIterTools;

/// Shared kernel for numeric arrays whose native type is hashable.
///
/// Each slot is inserted as an `Option` into a single hash set, so null slots
/// take part in the bookkeeping like any other value.
fn is_first<T>(ca: &ChunkedArray<T>) -> BooleanChunked
where
T: PolarsNumericType,
T::Native: Hash + Eq,
{
// Declared outside the per-chunk closure so that state carries over from one chunk to the next.
let mut unique = PlHashSet::new();
let chunks = ca
.downcast_iter()
.map(|arr| {
// `insert` returns `true` only for a value that was not in the set yet.
let mask: BooleanArray = arr
.into_iter()
.map(|opt_v| unique.insert(opt_v))
.collect_trusted();
Box::new(mask) as ArrayRef
})
.collect();

// NOTE(review): `from_chunks` is unsafe; every chunk built above is a `BooleanArray`,
// which is presumably the invariant it relies on — confirm against its docs.
unsafe { BooleanChunked::from_chunks(ca.name(), chunks) }
}

/// `IsFirst` for every numeric chunked array; dispatches to the kernel above.
impl<T> IsFirst<T> for ChunkedArray<T>
where
T: PolarsNumericType,
{
fn is_first(&self) -> PolarsResult<BooleanChunked> {
use DataType::*;
match self.dtype() {
// cast types to reduce compiler bloat
Int8 | Int16 | UInt8 | UInt16 => {
// The 8/16-bit integers are cast to Int32 and go through the Series-level `is_first`.
let s = self.cast(&DataType::Int32).unwrap();
s.is_first()
}
_ => {
// All remaining numeric dtypes are hashed through their bit representation;
// `bit_repr_is_large` selects between the large and the small variant.
if Self::bit_repr_is_large() {
let ca = self.bit_repr_large();
Ok(is_first(&ca))
} else {
let ca = self.bit_repr_small();
Ok(is_first(&ca))
}
}
}
}
}

/// `IsFirst` for string arrays: same hash-set approach as the numeric kernel,
/// applied to the optional values yielded by each chunk.
impl IsFirst<Utf8Type> for Utf8Chunked {
fn is_first(&self) -> PolarsResult<BooleanChunked> {
// Shared by all chunks, see the numeric kernel.
let mut unique = PlHashSet::new();
let chunks = self
.downcast_iter()
.map(|arr| {
let mask: BooleanArray = arr
.into_iter()
.map(|opt_v| unique.insert(opt_v))
.collect_trusted();
Box::new(mask) as ArrayRef
})
.collect();

unsafe { Ok(BooleanChunked::from_chunks(self.name(), chunks)) }
}
}

/// `IsFirst` for binary arrays (only with the `dtype-binary` feature); the body
/// mirrors the `Utf8Chunked` implementation.
#[cfg(feature = "dtype-binary")]
impl IsFirst<BinaryType> for BinaryChunked {
fn is_first(&self) -> PolarsResult<BooleanChunked> {
// Shared by all chunks, see the numeric kernel.
let mut unique = PlHashSet::new();
let chunks = self
.downcast_iter()
.map(|arr| {
let mask: BooleanArray = arr
.into_iter()
.map(|opt_v| unique.insert(opt_v))
.collect_trusted();
Box::new(mask) as ArrayRef
})
.collect();

unsafe { Ok(BooleanChunked::from_chunks(self.name(), chunks)) }
}
}
}

#[cfg(test)]
mod test {
use crate::prelude::*;
Expand Down Expand Up @@ -566,27 +476,6 @@ mod test {
);
}

/// `is_first` flags the first `1`, `2`, null and `3`; every repeat (including
/// the second null) is `false`.
#[test]
#[cfg(feature = "is_first")]
fn is_first() {
    let ca = UInt32Chunked::new(
        "a",
        &[Some(1), Some(2), Some(1), Some(1), None, Some(3), None],
    );

    let mask = ca.is_first().unwrap();
    let expected = [
        Some(true),
        Some(true),
        Some(false),
        Some(false),
        Some(true),
        Some(true),
        Some(false),
    ];

    assert_eq!(Vec::from(&mask), &expected);
}

#[test]
#[cfg(feature = "mode")]
fn mode() {
Expand Down
5 changes: 0 additions & 5 deletions polars/polars-core/src/series/implementations/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -329,11 +329,6 @@ impl SeriesTrait for SeriesWrap<BinaryChunked> {
RepeatBy::repeat_by(&self.0, by)
}

#[cfg(feature = "is_first")]
/// Mask of first occurrences; forwards to `is_first` on the wrapped `BinaryChunked` (`self.0`).
fn is_first(&self) -> PolarsResult<BooleanChunked> {
self.0.is_first()
}

#[cfg(feature = "mode")]
fn mode(&self) -> PolarsResult<Series> {
Ok(self.0.mode()?.into_series())
Expand Down
5 changes: 0 additions & 5 deletions polars/polars-core/src/series/implementations/categorical.rs
Original file line number Diff line number Diff line change
Expand Up @@ -403,11 +403,6 @@ impl SeriesTrait for SeriesWrap<CategoricalChunked> {
casted.list().unwrap().clone()
}

#[cfg(feature = "is_first")]
/// Mask of first occurrences, computed on the array returned by `logical()`
/// rather than on the categorical wrapper itself.
// NOTE(review): `logical()` presumably exposes the integer category codes — confirm.
fn is_first(&self) -> PolarsResult<BooleanChunked> {
self.0.logical().is_first()
}

#[cfg(feature = "mode")]
fn mode(&self) -> PolarsResult<Series> {
Ok(CategoricalChunked::full_null(self.0.logical().name(), 1).into_series())
Expand Down
5 changes: 0 additions & 5 deletions polars/polars-core/src/series/implementations/dates_time.rs
Original file line number Diff line number Diff line change
Expand Up @@ -503,11 +503,6 @@ macro_rules! impl_dyn_series {
_ => unreachable!(),
}
}
#[cfg(feature = "is_first")]
/// Mask of first occurrences; forwards to `is_first` on the wrapped array (`self.0`).
fn is_first(&self) -> PolarsResult<BooleanChunked> {
self.0.is_first()
}

#[cfg(feature = "mode")]
fn mode(&self) -> PolarsResult<Series> {
self.0.mode().map(|ca| ca.$into_logical().into_series())
Expand Down
5 changes: 0 additions & 5 deletions polars/polars-core/src/series/implementations/datetime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -519,11 +519,6 @@ impl SeriesTrait for SeriesWrap<DatetimeChunked> {
.unwrap()
.clone()
}
#[cfg(feature = "is_first")]
/// Mask of first occurrences; forwards to `is_first` on the wrapped `DatetimeChunked` (`self.0`).
fn is_first(&self) -> PolarsResult<BooleanChunked> {
self.0.is_first()
}

#[cfg(feature = "mode")]
fn mode(&self) -> PolarsResult<Series> {
self.0.mode().map(|ca| {
Expand Down
5 changes: 0 additions & 5 deletions polars/polars-core/src/series/implementations/duration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -517,11 +517,6 @@ impl SeriesTrait for SeriesWrap<DurationChunked> {
.unwrap()
.clone()
}
#[cfg(feature = "is_first")]
/// Mask of first occurrences; forwards to `is_first` on the wrapped `DurationChunked` (`self.0`).
fn is_first(&self) -> PolarsResult<BooleanChunked> {
self.0.is_first()
}

#[cfg(feature = "mode")]
fn mode(&self) -> PolarsResult<Series> {
self.0
Expand Down
5 changes: 0 additions & 5 deletions polars/polars-core/src/series/implementations/floats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -426,11 +426,6 @@ macro_rules! impl_dyn_series {
self.0.checked_div(rhs)
}

#[cfg(feature = "is_first")]
/// Mask of first occurrences; forwards to `is_first` on the wrapped array (`self.0`).
fn is_first(&self) -> PolarsResult<BooleanChunked> {
self.0.is_first()
}

#[cfg(feature = "mode")]
fn mode(&self) -> PolarsResult<Series> {
Ok(self.0.mode()?.into_series())
Expand Down
5 changes: 0 additions & 5 deletions polars/polars-core/src/series/implementations/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -514,11 +514,6 @@ macro_rules! impl_dyn_series {
self.0.checked_div(rhs)
}

#[cfg(feature = "is_first")]
/// Mask of first occurrences; forwards to `is_first` on the wrapped array (`self.0`).
fn is_first(&self) -> PolarsResult<BooleanChunked> {
self.0.is_first()
}

#[cfg(feature = "object")]
fn as_any(&self) -> &dyn Any {
&self.0
Expand Down
5 changes: 0 additions & 5 deletions polars/polars-core/src/series/implementations/utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -337,11 +337,6 @@ impl SeriesTrait for SeriesWrap<Utf8Chunked> {
RepeatBy::repeat_by(&self.0, by)
}

#[cfg(feature = "is_first")]
/// Mask of first occurrences; forwards to `is_first` on the wrapped `Utf8Chunked` (`self.0`).
fn is_first(&self) -> PolarsResult<BooleanChunked> {
self.0.is_first()
}

#[cfg(feature = "mode")]
fn mode(&self) -> PolarsResult<Series> {
Ok(self.0.mode()?.into_series())
Expand Down
6 changes: 0 additions & 6 deletions polars/polars-core/src/series/series_trait.rs
Original file line number Diff line number Diff line change
Expand Up @@ -641,12 +641,6 @@ pub trait SeriesTrait:
invalid_operation_panic!(self)
}

#[cfg(feature = "is_first")]
/// Get a mask of the first unique values.
///
/// This default body only invokes `invalid_operation_panic!`, so a dtype
/// supports the operation only if its `SeriesTrait` implementation overrides
/// this method.
fn is_first(&self) -> PolarsResult<BooleanChunked> {
invalid_operation_panic!(self)
}

#[cfg(feature = "mode")]
/// Compute the most occurring element in the array.
fn mode(&self) -> PolarsResult<Series> {
Expand Down
2 changes: 1 addition & 1 deletion polars/polars-lazy/polars-plan/src/dsl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1436,7 +1436,7 @@ impl Expr {
/// Get a mask of the first unique value.
pub fn is_first(self) -> Expr {
self.apply(
|s| s.is_first().map(|ca| ca.into_series()),
|s| is_first(&s).map(|s| s.into_series()),
GetOutput::from_type(DataType::Boolean),
)
.with_fmt("is_first")
Expand Down
1 change: 1 addition & 0 deletions polars/polars-ops/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ propagate_nans = []
performant = ["polars-core/performant"]
big_idx = ["polars-core/bigidx"]
round_series = []
is_first = []

# extra utilities for BinaryChunked
binary_encoding = ["base64", "hex", "dtype-binary"]
Expand Down
115 changes: 115 additions & 0 deletions polars/polars-ops/src/series/ops/is_first.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
use std::hash::Hash;

use arrow::array::BooleanArray;
use arrow::bitmap::MutableBitmap;
use polars_arrow::utils::CustomIterTools;
use polars_core::prelude::*;
use polars_core::with_match_physical_integer_polars_type;

/// Builds the "first occurrence" mask for a numeric [`ChunkedArray`].
///
/// Every slot (nulls included, since values are inserted as `Option`s) goes
/// into one hash set; a slot is `true` exactly when that insertion reported a
/// value that had not been seen before. One output chunk is produced per input
/// chunk.
fn is_first_numeric<T>(ca: &ChunkedArray<T>) -> BooleanChunked
where
    T: PolarsNumericType,
    T::Native: Hash + Eq,
{
    // Lives across chunks so that a repeat in a later chunk is still recognised.
    let mut seen = PlHashSet::new();
    let mut masks: Vec<ArrayRef> = Vec::new();

    for arr in ca.downcast_iter() {
        let first_seen: BooleanArray = arr
            .into_iter()
            .map(|opt_v| seen.insert(opt_v))
            .collect_trusted();
        masks.push(Box::new(first_seen) as ArrayRef);
    }

    // NOTE(review): `from_chunks` is unsafe; all chunks pushed above are
    // `BooleanArray`s, presumably the invariant it requires — confirm.
    unsafe { BooleanChunked::from_chunks(ca.name(), masks) }
}

/// Builds the "first occurrence" mask for a binary column.
///
/// Works like the numeric kernel: each optional byte slice is inserted into a
/// hash set shared by all chunks, and the slot is `true` when the insertion
/// found the value to be new.
#[cfg(feature = "dtype-binary")]
fn is_first_bin(ca: &BinaryChunked) -> BooleanChunked {
    // Lives across chunks so that a repeat in a later chunk is still recognised.
    let mut seen = PlHashSet::new();
    let mut masks: Vec<ArrayRef> = Vec::new();

    for arr in ca.downcast_iter() {
        let first_seen: BooleanArray = arr
            .into_iter()
            .map(|opt_v| seen.insert(opt_v))
            .collect_trusted();
        masks.push(Box::new(first_seen) as ArrayRef);
    }

    // NOTE(review): `from_chunks` is unsafe; all chunks pushed above are
    // `BooleanArray`s, presumably the invariant it requires — confirm.
    unsafe { BooleanChunked::from_chunks(ca.name(), masks) }
}

/// Builds the "first occurrence" mask for a boolean column.
///
/// A boolean column holds at most three distinct values — `false`, `true` and
/// null. The first slot containing each of them is flagged `true`; every other
/// slot is `false`. Nulls are treated as a value of their own, which is the
/// same behavior the hash-set based kernels in this file have (they insert
/// `Option`s into the set).
///
/// The result is a single chunk with the same length and name as `ca`.
fn is_first_boolean(ca: &BooleanChunked) -> BooleanChunked {
    let len = ca.len();
    let mut out = MutableBitmap::with_capacity(len);
    out.extend_constant(len, false);

    // One "already seen" flag per distinct value, indexed as [false, true, null].
    let mut seen = [false; 3];
    // How many distinct values are still to be located. Null only counts when
    // the column contains one, which lets the scan stop early on null-free data.
    let mut remaining = if ca.null_count() == 0 { 2 } else { 3 };

    for (idx, opt_v) in ca.into_iter().enumerate() {
        let slot = match opt_v {
            Some(false) => 0,
            Some(true) => 1,
            None => 2,
        };
        if !seen[slot] {
            seen[slot] = true;
            out.set(idx, true);
            remaining -= 1;
            // Every value that can occur has been found; the tail stays `false`.
            if remaining == 0 {
                break;
            }
        }
    }

    let chunks =
        vec![Box::new(BooleanArray::new(ArrowDataType::Boolean, out.into(), None)) as ArrayRef];
    // NOTE(review): `from_chunks` is unsafe; the only chunk is a `BooleanArray`,
    // presumably the invariant it requires — confirm.
    unsafe { BooleanChunked::from_chunks(ca.name(), chunks) }
}

/// Builds the "first occurrence" mask for a struct column.
///
/// The series is grouped with `group_tuples`, and the row index that
/// `take_group_firsts` reports for each group is flagged `true`; all other rows
/// stay `false`. The result is a single chunk named after `s`.
///
/// # Errors
///
/// Propagates any error returned by `group_tuples`.
#[cfg(feature = "dtype-struct")]
fn is_first_struct(s: &Series) -> PolarsResult<BooleanChunked> {
    // NOTE(review): the meaning of the two boolean flags is not visible from
    // here — check the `group_tuples` signature.
    let first_rows = s.group_tuples(true, false)?.take_group_firsts();

    let len = s.len();
    let mut mask = MutableBitmap::with_capacity(len);
    mask.extend_constant(len, false);

    first_rows.into_iter().for_each(|idx| {
        // Group tuples are always in bounds
        unsafe { mask.set_unchecked(idx as usize, true) }
    });

    let arr = BooleanArray::new(ArrowDataType::Boolean, mask.into(), None);
    let chunks = vec![Box::new(arr) as ArrayRef];
    Ok(unsafe { BooleanChunked::from_chunks(s.name(), chunks) })
}

/// Fallback kernel for `Utf8` columns when the `dtype-binary` feature is off.
///
/// With `dtype-binary` enabled strings are cast to binary and share that
/// kernel; without it this function keeps `Utf8` supported by applying the
/// same hash-set approach directly to the optional string values.
#[cfg(not(feature = "dtype-binary"))]
fn is_first_utf8(ca: &Utf8Chunked) -> BooleanChunked {
    // Shared by all chunks so that a repeat in a later chunk is still recognised.
    let mut unique = PlHashSet::new();
    let chunks = ca
        .downcast_iter()
        .map(|arr| {
            let mask: BooleanArray = arr
                .into_iter()
                .map(|opt_v| unique.insert(opt_v))
                .collect_trusted();
            Box::new(mask) as ArrayRef
        })
        .collect();

    unsafe { BooleanChunked::from_chunks(ca.name(), chunks) }
}

/// Get a mask of the first unique values of `s`.
///
/// The series is first converted with `to_physical_repr`, then dispatched on
/// its dtype:
///
/// * `Boolean`, `Binary` (feature `dtype-binary`) and `Struct` (feature
///   `dtype-struct`) have dedicated kernels;
/// * `Utf8` is cast to `Binary` when `dtype-binary` is enabled and otherwise
///   handled by a string kernel;
/// * `Float32` / `Float64` are processed through their bit representation;
/// * the remaining numeric dtypes go through the generic numeric kernel.
///
/// # Errors
///
/// Returns `PolarsError::InvalidOperation` for any other dtype, and propagates
/// errors raised by the cast or by the dtype-specific kernels.
pub fn is_first(s: &Series) -> PolarsResult<BooleanChunked> {
    let s = s.to_physical_repr();

    use DataType::*;
    let out = match s.dtype() {
        Boolean => {
            let ca = s.bool()?;
            is_first_boolean(ca)
        }
        #[cfg(feature = "dtype-binary")]
        Binary => {
            let ca = s.binary()?;
            is_first_bin(ca)
        }
        #[cfg(feature = "dtype-binary")]
        Utf8 => {
            // Reuse the binary kernel instead of instantiating a second one.
            let s = s.cast(&Binary)?;
            return is_first(&s);
        }
        #[cfg(not(feature = "dtype-binary"))]
        Utf8 => {
            let ca = s.utf8()?;
            is_first_utf8(ca)
        }
        Float32 => {
            let ca = s.bit_repr_small();
            is_first_numeric(&ca)
        }
        Float64 => {
            let ca = s.bit_repr_large();
            is_first_numeric(&ca)
        }
        dt if dt.is_numeric() => {
            with_match_physical_integer_polars_type!(s.dtype(), |$T| {
                let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();
                is_first_numeric(ca)
            })
        }
        #[cfg(feature = "dtype-struct")]
        Struct(_) => return is_first_struct(&s),
        // The function already returns a `PolarsResult`, so report unsupported
        // dtypes as an error instead of aborting the caller with a panic.
        dt => {
            return Err(PolarsError::InvalidOperation(
                format!("dtype {dt} not supported in 'is_first' operation").into(),
            ))
        }
    };
    Ok(out)
}
4 changes: 4 additions & 0 deletions polars/polars-ops/src/series/ops/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#[cfg(feature = "round_series")]
mod floor_divide;
#[cfg(feature = "is_first")]
mod is_first;
#[cfg(feature = "log")]
mod log;
#[cfg(feature = "rolling_window")]
Expand All @@ -10,6 +12,8 @@ mod various;

#[cfg(feature = "round_series")]
pub use floor_divide::*;
#[cfg(feature = "is_first")]
pub use is_first::*;
#[cfg(feature = "log")]
pub use log::*;
use polars_core::prelude::*;
Expand Down
Loading

0 comments on commit fb5503b

Please sign in to comment.