From b9f3f429b9b3e0a308c845777bb34e3c729e6e03 Mon Sep 17 00:00:00 2001 From: Marshall Date: Fri, 7 Jun 2024 13:01:04 -0400 Subject: [PATCH] feat(rust, python): Return datetime for mean/median of Date column (#16795) --- .../frame/group_by/aggregations/dispatch.rs | 122 +++++++++++------- .../src/series/implementations/date.rs | 13 +- crates/polars-core/src/series/mod.rs | 21 ++- .../sinks/group_by/aggregates/convert.rs | 2 +- .../src/logical_plan/aexpr/schema.rs | 10 +- .../src/logical_plan/conversion/dsl_to_ir.rs | 22 +--- py-polars/polars/dataframe/frame.py | 66 +++++----- py-polars/polars/lazyframe/frame.py | 80 ++++++------ py-polars/polars/series/datetime.py | 40 ++---- py-polars/src/series/aggregation.rs | 11 +- .../tests/unit/dataframe/test_describe.py | 2 +- .../namespaces/temporal/test_datetime.py | 28 +++- .../tests/unit/operations/test_group_by.py | 14 +- py-polars/tests/unit/series/test_describe.py | 2 +- .../unit/streaming/test_streaming_group_by.py | 8 +- 15 files changed, 253 insertions(+), 188 deletions(-) diff --git a/crates/polars-core/src/frame/group_by/aggregations/dispatch.rs b/crates/polars-core/src/frame/group_by/aggregations/dispatch.rs index 6b0258f7ac771..447352a0faaf3 100644 --- a/crates/polars-core/src/frame/group_by/aggregations/dispatch.rs +++ b/crates/polars-core/src/frame/group_by/aggregations/dispatch.rs @@ -127,6 +127,58 @@ impl Series { } } + #[doc(hidden)] + pub unsafe fn agg_mean(&self, groups: &GroupsProxy) -> Series { + // Prevent a rechunk for every individual group. 
+ let s = if groups.len() > 1 { + self.rechunk() + } else { + self.clone() + }; + + use DataType::*; + match s.dtype() { + Boolean => s.cast(&Float64).unwrap().agg_mean(groups), + Float32 => SeriesWrap(s.f32().unwrap().clone()).agg_mean(groups), + Float64 => SeriesWrap(s.f64().unwrap().clone()).agg_mean(groups), + dt if dt.is_numeric() => apply_method_physical_integer!(s, agg_mean, groups), + #[cfg(feature = "dtype-datetime")] + dt @ Datetime(_, _) => self + .to_physical_repr() + .agg_mean(groups) + .cast(&Int64) + .unwrap() + .cast(dt) + .unwrap(), + #[cfg(feature = "dtype-duration")] + dt @ Duration(_) => self + .to_physical_repr() + .agg_mean(groups) + .cast(&Int64) + .unwrap() + .cast(dt) + .unwrap(), + #[cfg(feature = "dtype-time")] + Time => self + .to_physical_repr() + .agg_mean(groups) + .cast(&Int64) + .unwrap() + .cast(&Time) + .unwrap(), + #[cfg(feature = "dtype-date")] + Date => (self + .to_physical_repr() + .agg_mean(groups) + .cast(&Float64) + .unwrap() + * (MS_IN_DAY as f64)) + .cast(&Datetime(TimeUnit::Milliseconds, None)) + .unwrap(), + _ => Series::full_null("", groups.len(), s.dtype()), + } + } + #[doc(hidden)] pub unsafe fn agg_median(&self, groups: &GroupsProxy) -> Series { // Prevent a rechunk for every individual group. 
@@ -143,21 +195,38 @@ impl Series { Float64 => SeriesWrap(s.f64().unwrap().clone()).agg_median(groups), dt if dt.is_numeric() => apply_method_physical_integer!(s, agg_median, groups), #[cfg(feature = "dtype-datetime")] - dt @ (Datetime(_, _) | Duration(_) | Time) => s + dt @ Datetime(_, _) => self .to_physical_repr() .agg_median(groups) .cast(&Int64) .unwrap() .cast(dt) .unwrap(), - dt @ Date => { - let ca = s.to_physical_repr(); - let physical_type = ca.dtype(); - let s = apply_method_physical_integer!(ca, agg_median, groups); - // back to physical and then - // back to logical type - s.cast(physical_type).unwrap().cast(dt).unwrap() - }, + #[cfg(feature = "dtype-duration")] + dt @ Duration(_) => self + .to_physical_repr() + .agg_median(groups) + .cast(&Int64) + .unwrap() + .cast(dt) + .unwrap(), + #[cfg(feature = "dtype-time")] + Time => self + .to_physical_repr() + .agg_median(groups) + .cast(&Int64) + .unwrap() + .cast(&Time) + .unwrap(), + #[cfg(feature = "dtype-date")] + Date => (self + .to_physical_repr() + .agg_median(groups) + .cast(&Float64) + .unwrap() + * (MS_IN_DAY as f64)) + .cast(&Datetime(TimeUnit::Milliseconds, None)) + .unwrap(), _ => Series::full_null("", groups.len(), s.dtype()), } } @@ -197,41 +266,6 @@ impl Series { } } - #[doc(hidden)] - pub unsafe fn agg_mean(&self, groups: &GroupsProxy) -> Series { - // Prevent a rechunk for every individual group. 
- let s = if groups.len() > 1 { - self.rechunk() - } else { - self.clone() - }; - - use DataType::*; - match s.dtype() { - Boolean => s.cast(&Float64).unwrap().agg_mean(groups), - Float32 => SeriesWrap(s.f32().unwrap().clone()).agg_mean(groups), - Float64 => SeriesWrap(s.f64().unwrap().clone()).agg_mean(groups), - dt if dt.is_numeric() => apply_method_physical_integer!(s, agg_mean, groups), - #[cfg(feature = "dtype-datetime")] - dt @ (Datetime(_, _) | Duration(_) | Time) => s - .to_physical_repr() - .agg_mean(groups) - .cast(&Int64) - .unwrap() - .cast(dt) - .unwrap(), - dt @ Date => { - let ca = s.to_physical_repr(); - let physical_type = ca.dtype(); - let s = apply_method_physical_integer!(ca, agg_mean, groups); - // back to physical and then - // back to logical type - s.cast(physical_type).unwrap().cast(dt).unwrap() - }, - _ => Series::full_null("", groups.len(), s.dtype()), - } - } - #[doc(hidden)] pub unsafe fn agg_last(&self, groups: &GroupsProxy) -> Series { // Prevent a rechunk for every individual group. 
diff --git a/crates/polars-core/src/series/implementations/date.rs b/crates/polars-core/src/series/implementations/date.rs index 735a4b3070566..a50afbab992d8 100644 --- a/crates/polars-core/src/series/implementations/date.rs +++ b/crates/polars-core/src/series/implementations/date.rs @@ -325,11 +325,14 @@ impl SeriesTrait for SeriesWrap { } fn median_reduce(&self) -> PolarsResult { - let av = AnyValue::from(self.median().map(|v| v as i64)) - .cast(self.dtype()) - .into_static() - .unwrap(); - Ok(Scalar::new(self.dtype().clone(), av)) + let av: AnyValue = self + .median() + .map(|v| (v * (MS_IN_DAY as f64)) as i64) + .into(); + Ok(Scalar::new( + DataType::Datetime(TimeUnit::Milliseconds, None), + av, + )) } fn clone_inner(&self) -> Arc { diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index 4de7924fa334c..fd35ec2e509e9 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -912,7 +912,26 @@ impl Series { let val = self.mean(); Scalar::new(DataType::Float64, val.into()) }, - dt if dt.is_temporal() => { + #[cfg(feature = "dtype-date")] + DataType::Date => { + let val = self.mean().map(|v| (v * MS_IN_DAY as f64) as i64); + let av: AnyValue = val.into(); + Scalar::new(DataType::Datetime(TimeUnit::Milliseconds, None), av) + }, + #[cfg(feature = "dtype-datetime")] + dt @ DataType::Datetime(_, _) => { + let val = self.mean().map(|v| v as i64); + let av: AnyValue = val.into(); + Scalar::new(dt.clone(), av) + }, + #[cfg(feature = "dtype-duration")] + dt @ DataType::Duration(_) => { + let val = self.mean().map(|v| v as i64); + let av: AnyValue = val.into(); + Scalar::new(dt.clone(), av) + }, + #[cfg(feature = "dtype-time")] + dt @ DataType::Time => { let val = self.mean().map(|v| v as i64); let av: AnyValue = val.into(); Scalar::new(dt.clone(), av) diff --git a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/convert.rs 
b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/convert.rs index e928e2ba8a082..c3bc052d4b2a2 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/convert.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/convert.rs @@ -241,7 +241,7 @@ where #[cfg(feature = "dtype-categorical")] if matches!( logical_dtype, - DataType::Categorical(_, _) | DataType::Enum(_, _) + DataType::Categorical(_, _) | DataType::Enum(_, _) | DataType::Date ) { return ( logical_dtype.clone(), diff --git a/crates/polars-plan/src/logical_plan/aexpr/schema.rs b/crates/polars-plan/src/logical_plan/aexpr/schema.rs index c339fac4ad9ea..8abe59d363abb 100644 --- a/crates/polars-plan/src/logical_plan/aexpr/schema.rs +++ b/crates/polars-plan/src/logical_plan/aexpr/schema.rs @@ -135,13 +135,19 @@ impl AExpr { Median(expr) => { let mut field = arena.get(*expr).to_field(schema, Context::Default, arena)?; - float_type(&mut field); + match field.dtype { + Date => field.coerce(Datetime(TimeUnit::Milliseconds, None)), + _ => float_type(&mut field), + } Ok(field) }, Mean(expr) => { let mut field = arena.get(*expr).to_field(schema, Context::Default, arena)?; - float_type(&mut field); + match field.dtype { + Date => field.coerce(Datetime(TimeUnit::Milliseconds, None)), + _ => float_type(&mut field), + } Ok(field) }, Implode(expr) => { diff --git a/crates/polars-plan/src/logical_plan/conversion/dsl_to_ir.rs b/crates/polars-plan/src/logical_plan/conversion/dsl_to_ir.rs index 33be0ded29aba..f5f7bb959e91f 100644 --- a/crates/polars-plan/src/logical_plan/conversion/dsl_to_ir.rs +++ b/crates/polars-plan/src/logical_plan/conversion/dsl_to_ir.rs @@ -471,16 +471,7 @@ pub fn to_alp_impl( &input_schema, ), StatsFunction::Mean => stats_helper( - |dt| { - dt.is_numeric() - || matches!( - dt, - DataType::Boolean - | DataType::Duration(_) - | DataType::Datetime(_, _) - | DataType::Time - ) - }, + |dt| dt.is_numeric() || dt.is_temporal() || dt == &DataType::Boolean, |name| 
col(name).mean(), &input_schema, ), @@ -500,16 +491,7 @@ pub fn to_alp_impl( stats_helper(|dt| dt.is_ord(), |name| col(name).max(), &input_schema) }, StatsFunction::Median => stats_helper( - |dt| { - dt.is_numeric() - || matches!( - dt, - DataType::Boolean - | DataType::Duration(_) - | DataType::Datetime(_, _) - | DataType::Time - ) - }, + |dt| dt.is_numeric() || dt.is_temporal() || dt == &DataType::Boolean, |name| col(name).median(), &input_schema, ), diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index bae550a575628..c5fa988c5a14b 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -4411,21 +4411,21 @@ def describe( >>> df.describe() shape: (9, 7) - ┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐ - │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ - ╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡ - │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ - │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │ - │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │ - │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │ - │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │ - │ 25% ┆ 2.8 ┆ 40.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ - │ 50% ┆ 2.8 ┆ 50.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ - │ 75% ┆ 3.0 ┆ 50.0 ┆ null ┆ null ┆ 2022-12-31 ┆ 23:15:10 │ - │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │ - └────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘ + ┌────────────┬──────────┬──────────┬──────────┬──────┬─────────────────────┬──────────┐ + │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪═════════════════════╪══════════╡ + │ count ┆ 3.0 ┆ 
2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 16:00:00 ┆ 16:07:10 │ + │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │ + │ 25% ┆ 2.8 ┆ 40.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ + │ 50% ┆ 2.8 ┆ 50.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ + │ 75% ┆ 3.0 ┆ 50.0 ┆ null ┆ null ┆ 2022-12-31 ┆ 23:15:10 │ + │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │ + └────────────┴──────────┴──────────┴──────────┴──────┴─────────────────────┴──────────┘ Customize which percentiles are displayed, applying linear interpolation: @@ -4435,24 +4435,24 @@ def describe( ... interpolation="linear", ... ) shape: (11, 7) - ┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐ - │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ - ╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡ - │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ - │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │ - │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │ - │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │ - │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │ - │ 10% ┆ 1.36 ┆ 41.0 ┆ null ┆ null ┆ 2020-04-20 ┆ 11:13:34 │ - │ 30% ┆ 2.08 ┆ 43.0 ┆ null ┆ null ┆ 2020-11-26 ┆ 12:59:42 │ - │ 50% ┆ 2.8 ┆ 45.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ - │ 70% ┆ 2.88 ┆ 47.0 ┆ null ┆ null ┆ 2022-02-07 ┆ 18:09:34 │ - │ 90% ┆ 2.96 ┆ 49.0 ┆ null ┆ null ┆ 2022-09-13 ┆ 21:33:18 │ - │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │ - └────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘ - """ + ┌────────────┬──────────┬──────────┬──────────┬──────┬─────────────────────┬──────────┐ + │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + 
│ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪═════════════════════╪══════════╡ + │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 16:00:00 ┆ 16:07:10 │ + │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │ + │ 10% ┆ 1.36 ┆ 41.0 ┆ null ┆ null ┆ 2020-04-20 ┆ 11:13:34 │ + │ 30% ┆ 2.08 ┆ 43.0 ┆ null ┆ null ┆ 2020-11-26 ┆ 12:59:42 │ + │ 50% ┆ 2.8 ┆ 45.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ + │ 70% ┆ 2.88 ┆ 47.0 ┆ null ┆ null ┆ 2022-02-07 ┆ 18:09:34 │ + │ 90% ┆ 2.96 ┆ 49.0 ┆ null ┆ null ┆ 2022-09-13 ┆ 21:33:18 │ + │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │ + └────────────┴──────────┴──────────┴──────────┴──────┴─────────────────────┴──────────┘ + """ # noqa: W505 if not self.columns: msg = "cannot describe a DataFrame that has no columns" raise TypeError(msg) diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 031e8afabd2c6..93b4b78533f69 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -772,21 +772,21 @@ def describe( >>> lf.describe() shape: (9, 7) - ┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐ - │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ - ╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡ - │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ - │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │ - │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │ - │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │ - │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │ - │ 25% ┆ 2.8 ┆ 40.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ - │ 50% ┆ 2.8 ┆ 50.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ - │ 75% ┆ 
3.0 ┆ 50.0 ┆ null ┆ null ┆ 2022-12-31 ┆ 23:15:10 │ - │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │ - └────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘ + ┌────────────┬──────────┬──────────┬──────────┬──────┬─────────────────────┬──────────┐ + │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪═════════════════════╪══════════╡ + │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 16:00:00 ┆ 16:07:10 │ + │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │ + │ 25% ┆ 2.8 ┆ 40.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ + │ 50% ┆ 2.8 ┆ 50.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ + │ 75% ┆ 3.0 ┆ 50.0 ┆ null ┆ null ┆ 2022-12-31 ┆ 23:15:10 │ + │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │ + └────────────┴──────────┴──────────┴──────────┴──────┴─────────────────────┴──────────┘ Customize which percentiles are displayed, applying linear interpolation: @@ -796,24 +796,24 @@ def describe( ... interpolation="linear", ... 
) shape: (11, 7) - ┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐ - │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ - ╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡ - │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ - │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │ - │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │ - │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │ - │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │ - │ 10% ┆ 1.36 ┆ 41.0 ┆ null ┆ null ┆ 2020-04-20 ┆ 11:13:34 │ - │ 30% ┆ 2.08 ┆ 43.0 ┆ null ┆ null ┆ 2020-11-26 ┆ 12:59:42 │ - │ 50% ┆ 2.8 ┆ 45.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ - │ 70% ┆ 2.88 ┆ 47.0 ┆ null ┆ null ┆ 2022-02-07 ┆ 18:09:34 │ - │ 90% ┆ 2.96 ┆ 49.0 ┆ null ┆ null ┆ 2022-09-13 ┆ 21:33:18 │ - │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │ - └────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘ - """ + ┌────────────┬──────────┬──────────┬──────────┬──────┬─────────────────────┬──────────┐ + │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪═════════════════════╪══════════╡ + │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 16:00:00 ┆ 16:07:10 │ + │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │ + │ 10% ┆ 1.36 ┆ 41.0 ┆ null ┆ null ┆ 2020-04-20 ┆ 11:13:34 │ + │ 30% ┆ 2.08 ┆ 43.0 ┆ null ┆ null ┆ 2020-11-26 ┆ 12:59:42 │ + │ 50% ┆ 2.8 ┆ 45.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ + │ 70% ┆ 2.88 ┆ 47.0 ┆ null ┆ null ┆ 2022-02-07 ┆ 18:09:34 │ + │ 90% ┆ 2.96 ┆ 49.0 ┆ null ┆ null ┆ 2022-09-13 ┆ 21:33:18 │ + │ max ┆ 3.0 
┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │ + └────────────┴──────────┴──────────┴──────────┴──────┴─────────────────────┴──────────┘ + """ # noqa: W505 from polars.convert import from_dict if not self.columns: @@ -847,9 +847,9 @@ def skip_minmax(dt: PolarsDataType) -> bool: ] # mean mean_expr = ( - F.col(c).to_physical().mean().cast(dtype) - if is_temporal - else (F.col(c).mean() if is_numeric or dtype == Boolean else null) + F.col(c).mean() + if is_temporal or is_numeric or dtype == Boolean + else null ) # standard deviation, min, max @@ -910,9 +910,11 @@ def skip_minmax(dt: PolarsDataType) -> bool: # cast by column type (numeric/bool -> float), (other -> string) for c in self.columns: summary[c] = [ # type: ignore[assignment] - None - if (v is None or isinstance(v, dict)) - else (float(v) if (c in has_numeric_result) else str(v)) + ( + None + if (v is None or isinstance(v, dict)) + else (float(v) if (c in has_numeric_result) else str(v)) + ) for v in summary[c] ] diff --git a/py-polars/polars/series/datetime.py b/py-polars/polars/series/datetime.py index e62d421cc2ba8..3165085ca3c9f 100644 --- a/py-polars/polars/series/datetime.py +++ b/py-polars/polars/series/datetime.py @@ -2,11 +2,9 @@ from typing import TYPE_CHECKING, Iterable -from polars._utils.convert import to_py_date, to_py_datetime from polars._utils.deprecation import deprecate_function from polars._utils.unstable import unstable from polars._utils.wrap import wrap_s -from polars.datatypes import Date, Datetime, Duration, Time from polars.series.utils import expr_dispatch if TYPE_CHECKING: @@ -156,13 +154,16 @@ def max(self) -> dt.date | dt.datetime | dt.timedelta | None: """ return wrap_s(self._s).max() # type: ignore[return-value] - def median(self) -> TemporalLiteral | float | None: + def median(self) -> TemporalLiteral | None: """ Return median as python DateTime. 
Examples -------- - >>> from datetime import datetime + >>> from datetime import date, datetime + >>> s = pl.Series([date(2001, 1, 1), date(2001, 1, 2)]) + >>> s.dt.median() + datetime.datetime(2001, 1, 1, 12, 0) >>> date = pl.datetime_range( ... datetime(2001, 1, 1), datetime(2001, 1, 3), "1d", eager=True ... ).alias("datetime") @@ -177,40 +178,25 @@ def median(self) -> TemporalLiteral | float | None: >>> date.dt.median() datetime.datetime(2001, 1, 2, 0, 0) """ - s = wrap_s(self._s) - out = s.median() - if out is not None: - if s.dtype == Date: - return to_py_date(int(out)) # type: ignore[arg-type] - elif s.dtype in (Datetime, Duration, Time): - return out # type: ignore[return-value] - else: - return to_py_datetime(int(out), s.dtype.time_unit) # type: ignore[arg-type, attr-defined] - return None - - def mean(self) -> TemporalLiteral | float | None: + return self._s.median() + + def mean(self) -> TemporalLiteral | None: """ Return mean as python DateTime. Examples -------- - >>> from datetime import datetime + >>> from datetime import date, datetime + >>> s = pl.Series([date(2001, 1, 1), date(2001, 1, 2)]) + >>> s.dt.mean() + datetime.datetime(2001, 1, 1, 12, 0) >>> s = pl.Series( ... [datetime(2001, 1, 1), datetime(2001, 1, 2), datetime(2001, 1, 3)] ... 
) >>> s.dt.mean() datetime.datetime(2001, 1, 2, 0, 0) """ - s = wrap_s(self._s) - out = s.mean() - if out is not None: - if s.dtype == Date: - return to_py_date(int(out)) # type: ignore[arg-type] - elif s.dtype in (Datetime, Duration, Time): - return out # type: ignore[return-value] - else: - return to_py_datetime(int(out), s.dtype.time_unit) # type: ignore[arg-type, attr-defined] - return None + return self._s.mean() def to_string(self, format: str) -> Series: """ diff --git a/py-polars/src/series/aggregation.rs b/py-polars/src/series/aggregation.rs index 5874ec08ac386..d8a5364afe3bd 100644 --- a/py-polars/src/series/aggregation.rs +++ b/py-polars/src/series/aggregation.rs @@ -1,4 +1,5 @@ use pyo3::prelude::*; +use DataType::*; use crate::error::PyPolarsErr; use crate::prelude::*; @@ -44,7 +45,7 @@ impl PySeries { fn mean(&self, py: Python) -> PyResult { match self.series.dtype() { - DataType::Boolean => Ok(Wrap( + Boolean => Ok(Wrap( self.series .cast(&DataType::UInt8) .unwrap() @@ -52,7 +53,8 @@ impl PySeries { .as_any_value(), ) .into_py(py)), - DataType::Datetime(_, _) | DataType::Duration(_) | DataType::Time => { + // For non-numeric output types we require mean_reduce. + dt if dt.is_temporal() => { Ok(Wrap(self.series.mean_reduce().as_any_value()).into_py(py)) }, _ => Ok(self.series.mean().into_py(py)), @@ -61,7 +63,7 @@ impl PySeries { fn median(&self, py: Python) -> PyResult { match self.series.dtype() { - DataType::Boolean => Ok(Wrap( + Boolean => Ok(Wrap( self.series .cast(&DataType::UInt8) .unwrap() @@ -70,7 +72,8 @@ impl PySeries { .as_any_value(), ) .into_py(py)), - DataType::Datetime(_, _) | DataType::Duration(_) | DataType::Time => Ok(Wrap( + // For non-numeric output types we require median_reduce. + dt if dt.is_temporal() => Ok(Wrap( self.series .median_reduce() .map_err(PyPolarsErr::from)? 
diff --git a/py-polars/tests/unit/dataframe/test_describe.py b/py-polars/tests/unit/dataframe/test_describe.py index 2c38aec302e93..edeee04ae3ed3 100644 --- a/py-polars/tests/unit/dataframe/test_describe.py +++ b/py-polars/tests/unit/dataframe/test_describe.py @@ -75,7 +75,7 @@ def test_df_describe(lazy: bool) -> None: "g": [ "3", "0", - "2021-07-02", + "2021-07-02 16:00:00", None, "2020-01-01", "2021-07-05", diff --git a/py-polars/tests/unit/operations/namespaces/temporal/test_datetime.py b/py-polars/tests/unit/operations/namespaces/temporal/test_datetime.py index 9fb3a12d1030a..6312b2a44baed 100644 --- a/py-polars/tests/unit/operations/namespaces/temporal/test_datetime.py +++ b/py-polars/tests/unit/operations/namespaces/temporal/test_datetime.py @@ -997,9 +997,9 @@ def test_weekday(time_unit: TimeUnit) -> None: [ ([], None), ([None, None], None), - ([date(2022, 1, 1)], date(2022, 1, 1)), - ([date(2022, 1, 1), date(2022, 1, 2), date(2022, 1, 3)], date(2022, 1, 2)), - ([date(2022, 1, 1), date(2022, 1, 2), date(2024, 5, 15)], date(2022, 1, 2)), + ([date(2022, 1, 1)], datetime(2022, 1, 1)), + ([date(2022, 1, 1), date(2022, 1, 2), date(2022, 1, 4)], datetime(2022, 1, 2)), + ([date(2022, 1, 1), date(2022, 1, 2), date(2024, 5, 15)], datetime(2022, 1, 2)), ([datetime(2022, 1, 1)], datetime(2022, 1, 1)), ( [datetime(2022, 1, 1), datetime(2022, 1, 2), datetime(2022, 1, 3)], @@ -1048,9 +1048,15 @@ def test_median( [ ([], None), ([None, None], None), - ([date(2022, 1, 1)], date(2022, 1, 1)), - ([date(2022, 1, 1), date(2022, 1, 2), date(2022, 1, 3)], date(2022, 1, 2)), - ([date(2022, 1, 1), date(2022, 1, 2), date(2024, 5, 15)], date(2022, 10, 16)), + ([date(2022, 1, 1)], datetime(2022, 1, 1)), + ( + [date(2022, 1, 1), date(2022, 1, 2), date(2022, 1, 4)], + datetime(2022, 1, 2, 8), + ), + ( + [date(2022, 1, 1), date(2022, 1, 2), date(2024, 5, 15)], + datetime(2022, 10, 16, 16, 0), + ), ([datetime(2022, 1, 1)], datetime(2022, 1, 1)), ( [datetime(2022, 1, 1), datetime(2022, 1, 
2), datetime(2022, 1, 3)], @@ -1177,6 +1183,10 @@ def test_duration_median_with_tu( def test_agg_mean_expr() -> None: df = pl.DataFrame( { + "date": pl.Series( + [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 4)], + dtype=pl.Date, + ), "datetime_ms": pl.Series( [datetime(2023, 1, 1), datetime(2023, 1, 2), datetime(2023, 1, 4)], dtype=pl.Datetime("ms"), @@ -1210,6 +1220,7 @@ def test_agg_mean_expr() -> None: expected = pl.DataFrame( { + "date": pl.Series([datetime(2023, 1, 2, 8, 0)], dtype=pl.Datetime("ms")), "datetime_ms": pl.Series( [datetime(2023, 1, 2, 8, 0, 0)], dtype=pl.Datetime("ms") ), @@ -1238,6 +1249,10 @@ def test_agg_mean_expr() -> None: def test_agg_median_expr() -> None: df = pl.DataFrame( { + "date": pl.Series( + [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 4)], + dtype=pl.Date, + ), "datetime_ms": pl.Series( [datetime(2023, 1, 1), datetime(2023, 1, 2), datetime(2023, 1, 4)], dtype=pl.Datetime("ms"), @@ -1271,6 +1286,7 @@ def test_agg_median_expr() -> None: expected = pl.DataFrame( { + "date": pl.Series([datetime(2023, 1, 2)], dtype=pl.Datetime("ms")), "datetime_ms": pl.Series([datetime(2023, 1, 2)], dtype=pl.Datetime("ms")), "datetime_us": pl.Series([datetime(2023, 1, 2)], dtype=pl.Datetime("us")), "datetime_ns": pl.Series([datetime(2023, 1, 2)], dtype=pl.Datetime("ns")), diff --git a/py-polars/tests/unit/operations/test_group_by.py b/py-polars/tests/unit/operations/test_group_by.py index 6bba541e0613a..e95ea2703970d 100644 --- a/py-polars/tests/unit/operations/test_group_by.py +++ b/py-polars/tests/unit/operations/test_group_by.py @@ -1,7 +1,7 @@ from __future__ import annotations from collections import OrderedDict -from datetime import datetime, timedelta +from datetime import date, datetime, timedelta from typing import TYPE_CHECKING, Any import numpy as np @@ -67,6 +67,12 @@ def test_group_by() -> None: ([1, 2, 3, 4], [2, 4], pl.Float32, pl.Float32), ([1, 2, 3, 4], [2, 4], pl.Float64, pl.Float64), ([False, True, True, True], [2 / 3, 
1], pl.Boolean, pl.Float64), + ( + [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 4), date(2023, 1, 5)], + [datetime(2023, 1, 2, 8, 0, 0), datetime(2023, 1, 5)], + pl.Date, + pl.Datetime("ms"), + ), ( [ datetime(2023, 1, 1), @@ -158,6 +164,12 @@ def test_group_by_mean_by_dtype( ([1, 2, 4, 5], [2, 5], pl.Float32, pl.Float32), ([1, 2, 4, 5], [2, 5], pl.Float64, pl.Float64), ([False, True, True, True], [1, 1], pl.Boolean, pl.Float64), + ( + [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 4), date(2023, 1, 5)], + [datetime(2023, 1, 2), datetime(2023, 1, 5)], + pl.Date, + pl.Datetime("ms"), + ), ( [ datetime(2023, 1, 1), diff --git a/py-polars/tests/unit/series/test_describe.py b/py-polars/tests/unit/series/test_describe.py index cdf1804232312..be0641a02fb7a 100644 --- a/py-polars/tests/unit/series/test_describe.py +++ b/py-polars/tests/unit/series/test_describe.py @@ -81,7 +81,7 @@ def test_series_describe_date() -> None: stats = { "count": "3", "null_count": "0", - "mean": "2010-09-29", + "mean": "2010-09-29 16:00:00", "min": "1999-12-31", "25%": "2005-08-05", "50%": "2011-03-11", diff --git a/py-polars/tests/unit/streaming/test_streaming_group_by.py b/py-polars/tests/unit/streaming/test_streaming_group_by.py index 5e18040db3983..2540cd7c0fd1a 100644 --- a/py-polars/tests/unit/streaming/test_streaming_group_by.py +++ b/py-polars/tests/unit/streaming/test_streaming_group_by.py @@ -65,7 +65,8 @@ def test_streaming_group_by_types() -> None: pl.col("bool").mean().alias("bool_mean"), pl.col("bool").sum().alias("bool_sum"), pl.col("date").sum().alias("date_sum"), - pl.col("date").mean().alias("date_mean"), + # Date streaming mean/median has been temporarily disabled + # pl.col("date").mean().alias("date_mean"), pl.col("date").first().alias("date_first"), pl.col("date").last().alias("date_last"), pl.col("date").min().alias("date_min"), @@ -86,7 +87,7 @@ def test_streaming_group_by_types() -> None: "bool_mean": pl.Float64, "bool_sum": pl.UInt32, "date_sum": pl.Date, - 
"date_mean": pl.Date, + # "date_mean": pl.Date, "date_first": pl.Date, "date_last": pl.Date, "date_min": pl.Date, @@ -103,7 +104,8 @@ def test_streaming_group_by_types() -> None: "bool_mean": [0.5], "bool_sum": [1], "date_sum": [date(2074, 1, 1)], - "date_mean": [date(2022, 1, 1)], + # Date streaming mean/median has been temporarily disabled + # "date_mean": [date(2022, 1, 1)], "date_first": [date(2022, 1, 1)], "date_last": [date(2022, 1, 1)], "date_min": [date(2022, 1, 1)],