diff --git a/py-polars/docs/source/reference/dataframe/group_by.rst b/py-polars/docs/source/reference/dataframe/group_by.rst index 5855d518f492..f5cdbe675fc7 100644 --- a/py-polars/docs/source/reference/dataframe/group_by.rst +++ b/py-polars/docs/source/reference/dataframe/group_by.rst @@ -16,6 +16,7 @@ This namespace is available after calling :code:`DataFrame.group_by(...)`. GroupBy.first GroupBy.head GroupBy.last + GroupBy.map_groups GroupBy.max GroupBy.mean GroupBy.median diff --git a/py-polars/docs/source/reference/dataframe/miscellaneous.rst b/py-polars/docs/source/reference/dataframe/miscellaneous.rst index c10fb4095296..116c1d577231 100644 --- a/py-polars/docs/source/reference/dataframe/miscellaneous.rst +++ b/py-polars/docs/source/reference/dataframe/miscellaneous.rst @@ -10,3 +10,4 @@ Miscellaneous DataFrame.corr DataFrame.frame_equal DataFrame.lazy + DataFrame.map_rows diff --git a/py-polars/docs/source/reference/expressions/functions.rst b/py-polars/docs/source/reference/expressions/functions.rst index f8ba9f8a3fb6..5cf2f23581df 100644 --- a/py-polars/docs/source/reference/expressions/functions.rst +++ b/py-polars/docs/source/reference/expressions/functions.rst @@ -53,6 +53,7 @@ These functions are available from the polars module root and can be used as exp last lit map + map_groups max max_horizontal mean diff --git a/py-polars/docs/source/reference/lazyframe/group_by.rst b/py-polars/docs/source/reference/lazyframe/group_by.rst index 05e786726e3a..81bb5d272ac0 100644 --- a/py-polars/docs/source/reference/lazyframe/group_by.rst +++ b/py-polars/docs/source/reference/lazyframe/group_by.rst @@ -16,6 +16,7 @@ This namespace comes available by calling `LazyFrame.group_by(..)`. LazyGroupBy.first LazyGroupBy.head LazyGroupBy.last + LazyGroupBy.map_groups LazyGroupBy.max LazyGroupBy.mean LazyGroupBy.median diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index 159394cec4d0..487f5a2edf84 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -121,6 +121,7 @@ last, lit, map, + map_groups, max, max_horizontal, mean, @@ -333,6 +334,7 @@ "last", "lit", "map", + "map_groups", "mean", "median", "n_unique", diff --git a/py-polars/polars/dataframe/group_by.py b/py-polars/polars/dataframe/group_by.py index 306b2e575ea7..3b0afd389881 100644 --- a/py-polars/polars/dataframe/group_by.py +++ b/py-polars/polars/dataframe/group_by.py @@ -5,6 +5,7 @@ import polars._reexport as pl from polars import functions as F from polars.utils.convert import _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_renamed_function if TYPE_CHECKING: import sys @@ -242,7 +243,7 @@ def agg( .collect(no_optimization=True) ) - def apply(self, function: Callable[[DataFrame], DataFrame]) -> DataFrame: + def map_groups(self, function: Callable[[DataFrame], DataFrame]) -> DataFrame: """ Apply a custom/user-defined function (UDF) over the groups as a sub-DataFrame. @@ -273,6 +274,8 @@ def apply(self, function: Callable[[DataFrame], DataFrame]) -> DataFrame: Examples -------- + For each color group sample two rows: + >>> df = pl.DataFrame( ... { ... "id": [0, 1, 2, 3, 4], @@ -280,23 +283,7 @@ def apply(self, function: Callable[[DataFrame], DataFrame]) -> DataFrame: ... "shape": ["square", "triangle", "square", "triangle", "square"], ... } ... ) - >>> df - shape: (5, 3) - ┌─────┬───────┬──────────┐ - │ id ┆ color ┆ shape │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═══════╪══════════╡ - │ 0 ┆ red ┆ square │ - │ 1 ┆ green ┆ triangle │ - │ 2 ┆ green ┆ square │ - │ 3 ┆ red ┆ triangle │ - │ 4 ┆ red ┆ square │ - └─────┴───────┴──────────┘ - - For each color group sample two rows: - - >>> df.group_by("color").apply( + >>> df.group_by("color").map_groups( ... lambda group_df: group_df.sample(2) ... ) # doctest: +IGNORE_RESULT shape: (4, 3) @@ -325,15 +312,15 @@ def apply(self, function: Callable[[DataFrame], DataFrame]) -> DataFrame: elif isinstance(self.by, Iterable) and all(isinstance(c, str) for c in self.by): by = list(self.by) # type: ignore[arg-type] else: - raise TypeError("cannot call `apply` when grouping by an expression") + raise TypeError("cannot call `map_groups` when grouping by an expression") if all(isinstance(c, str) for c in self.more_by): by.extend(self.more_by) # type: ignore[arg-type] else: - raise TypeError("cannot call `apply` when grouping by an expression") + raise TypeError("cannot call `map_groups` when grouping by an expression") return self.df.__class__._from_pydf( - self.df._df.group_by_apply(by, function, self.maintain_order) + self.df._df.group_by_map_groups(by, function, self.maintain_order) ) def head(self, n: int = 5) -> DataFrame: @@ -760,6 +747,22 @@ def sum(self) -> DataFrame: """ return self.agg(F.all().sum()) + @deprecate_renamed_function("map_groups", version="0.19.0") + def apply(self, function: Callable[[DataFrame], DataFrame]) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the groups as a sub-DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`GroupBy.map_groups`. + + Parameters + ---------- + function + Custom function. + + """ + return self.map_groups(function) + class RollingGroupBy: """ @@ -866,7 +869,7 @@ def agg( .collect(no_optimization=True) ) - def apply( + def map_groups( self, function: Callable[[DataFrame], DataFrame], schema: SchemaDict | None, @@ -883,7 +886,7 @@ def apply( The idiomatic way to apply custom functions over multiple columns is using: - `pl.struct([my_columns]).apply(lambda struct_series: ..)` + `pl.struct([my_columns]).map_elements(lambda struct_series: ..)` Parameters ---------- @@ -894,58 +897,6 @@ def apply( given schema is incorrect, this is a bug in the caller's query and may lead to errors. If set to None, polars assumes the schema is unchanged. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "id": [0, 1, 2, 3, 4], - ... "color": ["red", "green", "green", "red", "red"], - ... "shape": ["square", "triangle", "square", "triangle", "square"], - ... } - ... ) - >>> df - shape: (5, 3) - ┌─────┬───────┬──────────┐ - │ id ┆ color ┆ shape │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═══════╪══════════╡ - │ 0 ┆ red ┆ square │ - │ 1 ┆ green ┆ triangle │ - │ 2 ┆ green ┆ square │ - │ 3 ┆ red ┆ triangle │ - │ 4 ┆ red ┆ square │ - └─────┴───────┴──────────┘ - - For each color group sample two rows: - - >>> ( - ... df.lazy() - ... .group_by("color") - ... .apply(lambda group_df: group_df.sample(2), schema=None) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - shape: (4, 3) - ┌─────┬───────┬──────────┐ - │ id ┆ color ┆ shape │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═══════╪══════════╡ - │ 1 ┆ green ┆ triangle │ - │ 2 ┆ green ┆ square │ - │ 4 ┆ red ┆ square │ - │ 3 ┆ red ┆ triangle │ - └─────┴───────┴──────────┘ - - It is better to implement this with an expression: - - >>> ( - ... df.lazy() - ... .filter(pl.int_range(0, pl.count()).shuffle().over("color") < 2) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - """ return ( self.df.lazy() @@ -957,10 +908,34 @@ def apply( by=self.by, check_sorted=self.check_sorted, ) - .apply(function, schema) + .map_groups(function, schema) .collect(no_optimization=True) ) + @deprecate_renamed_function("map_groups", version="0.19.0") + def apply( + self, + function: Callable[[DataFrame], DataFrame], + schema: SchemaDict | None, + ) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the groups as a new DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`RollingGroupBy.map_groups`. + + Parameters + ---------- + function + Function to apply over each group of the `LazyFrame`. + schema + Schema of the output function. This has to be known statically. If the + given schema is incorrect, this is a bug in the caller's query and may + lead to errors. If set to None, polars assumes the schema is unchanged. + + """ + return self.map_groups(function, schema) + class DynamicGroupBy: """ @@ -1084,7 +1059,7 @@ def agg( .collect(no_optimization=True) ) - def apply( + def map_groups( self, function: Callable[[DataFrame], DataFrame], schema: SchemaDict | None, @@ -1101,7 +1076,7 @@ def apply( The idiomatic way to apply custom functions over multiple columns is using: - `pl.struct([my_columns]).apply(lambda struct_series: ..)` + `pl.struct([my_columns]).map_elements(lambda struct_series: ..)` Parameters ---------- @@ -1112,58 +1087,6 @@ def apply( given schema is incorrect, this is a bug in the caller's query and may lead to errors. If set to None, polars assumes the schema is unchanged. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "id": [0, 1, 2, 3, 4], - ... "color": ["red", "green", "green", "red", "red"], - ... "shape": ["square", "triangle", "square", "triangle", "square"], - ... } - ... ) - >>> df - shape: (5, 3) - ┌─────┬───────┬──────────┐ - │ id ┆ color ┆ shape │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═══════╪══════════╡ - │ 0 ┆ red ┆ square │ - │ 1 ┆ green ┆ triangle │ - │ 2 ┆ green ┆ square │ - │ 3 ┆ red ┆ triangle │ - │ 4 ┆ red ┆ square │ - └─────┴───────┴──────────┘ - - For each color group sample two rows: - - >>> ( - ... df.lazy() - ... .group_by("color") - ... .apply(lambda group_df: group_df.sample(2), schema=None) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - shape: (4, 3) - ┌─────┬───────┬──────────┐ - │ id ┆ color ┆ shape │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═══════╪══════════╡ - │ 1 ┆ green ┆ triangle │ - │ 2 ┆ green ┆ square │ - │ 4 ┆ red ┆ square │ - │ 3 ┆ red ┆ triangle │ - └─────┴───────┴──────────┘ - - It is better to implement this with an expression: - - >>> ( - ... df.lazy() - ... .filter(pl.int_range(0, pl.count()).shuffle().over("color") < 2) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - """ return ( self.df.lazy() @@ -1179,6 +1102,30 @@ def apply( start_by=self.start_by, check_sorted=self.check_sorted, ) - .apply(function, schema) + .map_groups(function, schema) .collect(no_optimization=True) ) + + @deprecate_renamed_function("map_groups", version="0.19.0") + def apply( + self, + function: Callable[[DataFrame], DataFrame], + schema: SchemaDict | None, + ) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the groups as a new DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DynamicGroupBy.map_groups`. + + Parameters + ---------- + function + Function to apply over each group of the `LazyFrame`. + schema + Schema of the output function. This has to be known statically. If the + given schema is incorrect, this is a bug in the caller's query and may + lead to errors. If set to None, polars assumes the schema is unchanged. + + """ + return self.map_groups(function, schema) diff --git a/py-polars/polars/functions/__init__.py b/py-polars/polars/functions/__init__.py index 1c780163cdf8..0cab9ebbd7fe 100644 --- a/py-polars/polars/functions/__init__.py +++ b/py-polars/polars/functions/__init__.py @@ -50,6 +50,7 @@ implode, last, map, + map_groups, mean, median, n_unique, @@ -138,6 +139,7 @@ "last", "lit", "map", + "map_groups", "mean", "median", "n_unique", diff --git a/py-polars/polars/functions/lazy.py b/py-polars/polars/functions/lazy.py index 4194d8b724db..b1f77b79bc42 100644 --- a/py-polars/polars/functions/lazy.py +++ b/py-polars/polars/functions/lazy.py @@ -1019,12 +1019,12 @@ def map( exprs = parse_as_list_of_expressions(exprs) return wrap_expr( plr.map_mul( - exprs, function, return_dtype, apply_groups=False, returns_scalar=False + exprs, function, return_dtype, map_groups=False, returns_scalar=False ) ) -def apply( +def map_groups( exprs: Sequence[str | Expr], function: Callable[[Sequence[Series]], Series | Any], return_dtype: PolarsDataType | None = None, @@ -1084,7 +1084,7 @@ def apply( └───────┴─────┴─────┘ >>> ( ... df.group_by("group").agg( - ... pl.apply( + ... pl.map_groups( ... exprs=["a", "b"], ... function=lambda list_of_series: list_of_series[0] ... / list_of_series[0].sum() @@ -1114,12 +1114,46 @@ def apply( exprs, function, return_dtype, - apply_groups=True, + map_groups=True, returns_scalar=returns_scalar, ) ) +@deprecate_renamed_function("map_groups", version="0.19.0") +def apply( + exprs: Sequence[str | Expr], + function: Callable[[Sequence[Series]], Series | Any], + return_dtype: PolarsDataType | None = None, + *, + returns_scalar: bool = True, +) -> Expr: + """ + Apply a custom/user-defined function (UDF) in a GroupBy context. + + .. deprecated:: 0.19.0 + This function has been renamed to :func:`map_groups`. + + Parameters + ---------- + exprs + Input Series to f + function + Function to apply over the input + return_dtype + dtype of the output Series + returns_scalar + If the function returns a single scalar as output. + + Returns + ------- + Expr + Expression with the data type given by ``return_dtype``. + + """ + return map_groups(exprs, function, return_dtype, returns_scalar=returns_scalar) + + def fold( acc: IntoExpr, function: Callable[[Series, Series], Series], diff --git a/py-polars/polars/lazyframe/group_by.py b/py-polars/polars/lazyframe/group_by.py index 06d700d86315..f0a6bdf6aea3 100644 --- a/py-polars/polars/lazyframe/group_by.py +++ b/py-polars/polars/lazyframe/group_by.py @@ -5,6 +5,7 @@ from polars import functions as F from polars.utils._parse_expr_input import parse_as_list_of_expressions from polars.utils._wrap import wrap_ldf +from polars.utils.deprecation import deprecate_renamed_function if TYPE_CHECKING: from polars import DataFrame, LazyFrame @@ -144,7 +145,7 @@ def agg( pyexprs = parse_as_list_of_expressions(*aggs, **named_aggs) return wrap_ldf(self.lgb.agg(pyexprs)) - def apply( + def map_groups( self, function: Callable[[DataFrame], DataFrame], schema: SchemaDict | None, @@ -176,9 +177,10 @@ def apply( given schema is incorrect, this is a bug in the caller's query and may lead to errors. If set to None, polars assumes the schema is unchanged. - Examples -------- + For each color group sample two rows: + >>> df = pl.DataFrame( ... { ... "id": [0, 1, 2, 3, 4], @@ -186,26 +188,10 @@ def apply( ... "shape": ["square", "triangle", "square", "triangle", "square"], ... } ... ) - >>> df - shape: (5, 3) - ┌─────┬───────┬──────────┐ - │ id ┆ color ┆ shape │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═══════╪══════════╡ - │ 0 ┆ red ┆ square │ - │ 1 ┆ green ┆ triangle │ - │ 2 ┆ green ┆ square │ - │ 3 ┆ red ┆ triangle │ - │ 4 ┆ red ┆ square │ - └─────┴───────┴──────────┘ - - For each color group sample two rows: - >>> ( ... df.lazy() ... .group_by("color") - ... .apply(lambda group_df: group_df.sample(2), schema=None) + ... .map_groups(lambda group_df: group_df.sample(2), schema=None) ... .collect() ... ) # doctest: +IGNORE_RESULT shape: (4, 3) @@ -229,7 +215,7 @@ def apply( ... ) # doctest: +IGNORE_RESULT """ - return wrap_ldf(self.lgb.apply(function, schema)) + return wrap_ldf(self.lgb.map_groups(function, schema)) def head(self, n: int = 5) -> LazyFrame: """ @@ -649,3 +635,27 @@ def sum(self) -> LazyFrame: """ return self.agg(F.all().sum()) + + @deprecate_renamed_function("map_groups", version="0.19.0") + def apply( + self, + function: Callable[[DataFrame], DataFrame], + schema: SchemaDict | None, + ) -> LazyFrame: + """ + Apply a custom/user-defined function (UDF) over the groups as a new DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyGroupBy.map_groups`. + + Parameters + ---------- + function + Function to apply over each group of the `LazyFrame`. + schema + Schema of the output function. This has to be known statically. If the + given schema is incorrect, this is a bug in the caller's query and may + lead to errors. If set to None, polars assumes the schema is unchanged. + + """ + return self.map_groups(function, schema) diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index 570af0ac5bb3..57da8fd6fa41 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -1134,7 +1134,7 @@ impl PyDataFrame { Ok(df.into()) } - pub fn group_by_apply( + pub fn group_by_map_groups( &self, by: Vec<&str>, lambda: PyObject, diff --git a/py-polars/src/functions/lazy.rs b/py-polars/src/functions/lazy.rs index 4b6c44de131f..8bfdd329445c 100644 --- a/py-polars/src/functions/lazy.rs +++ b/py-polars/src/functions/lazy.rs @@ -388,23 +388,16 @@ pub fn lit(value: &PyAny, allow_object: bool) -> PyResult { } #[pyfunction] -#[pyo3(signature = (pyexpr, lambda, output_type, apply_groups, returns_scalar))] +#[pyo3(signature = (pyexpr, lambda, output_type, map_groups, returns_scalar))] pub fn map_mul( py: Python, pyexpr: Vec, lambda: PyObject, output_type: Option>, - apply_groups: bool, + map_groups: bool, returns_scalar: bool, ) -> PyExpr { - map::lazy::map_mul( - &pyexpr, - py, - lambda, - output_type, - apply_groups, - returns_scalar, - ) + map::lazy::map_mul(&pyexpr, py, lambda, output_type, map_groups, returns_scalar) } #[pyfunction] diff --git a/py-polars/src/lazygroupby.rs b/py-polars/src/lazygroupby.rs index a6ac381340c9..d74163b8e43b 100644 --- a/py-polars/src/lazygroupby.rs +++ b/py-polars/src/lazygroupby.rs @@ -34,7 +34,11 @@ impl PyLazyGroupBy { lgb.tail(Some(n)).into() } - fn apply(&mut self, lambda: PyObject, schema: Option>) -> PyResult { + fn map_groups( + &mut self, + lambda: PyObject, + schema: Option>, + ) -> PyResult { let lgb = self.lgb.take().unwrap(); let schema = match schema { Some(schema) => Arc::new(schema.0), diff --git a/py-polars/src/map/dataframe.rs b/py-polars/src/map/dataframe.rs index 84f61f7378e0..c567d3d397b1 100644 --- a/py-polars/src/map/dataframe.rs +++ b/py-polars/src/map/dataframe.rs @@ -171,10 +171,10 @@ where { let skip = usize::from(first_value.is_some()); if init_null_count == df.height() { - ChunkedArray::full_null("apply", df.height()) + ChunkedArray::full_null("map", df.height()) } else { let iter = apply_iter(df, py, lambda, init_null_count, skip); - iterator_to_primitive(iter, init_null_count, first_value, "apply", df.height()) + iterator_to_primitive(iter, init_null_count, first_value, "map", df.height()) } } @@ -188,10 +188,10 @@ pub fn apply_lambda_with_bool_out_type<'a>( ) -> ChunkedArray { let skip = usize::from(first_value.is_some()); if init_null_count == df.height() { - ChunkedArray::full_null("apply", df.height()) + ChunkedArray::full_null("map", df.height()) } else { let iter = apply_iter(df, py, lambda, init_null_count, skip); - iterator_to_bool(iter, init_null_count, first_value, "apply", df.height()) + iterator_to_bool(iter, init_null_count, first_value, "map", df.height()) } } @@ -205,10 +205,10 @@ pub fn apply_lambda_with_utf8_out_type<'a>( ) -> Utf8Chunked { let skip = usize::from(first_value.is_some()); if init_null_count == df.height() { - ChunkedArray::full_null("apply", df.height()) + ChunkedArray::full_null("map", df.height()) } else { let iter = apply_iter::<&str>(df, py, lambda, init_null_count, skip); - iterator_to_utf8(iter, init_null_count, first_value, "apply", df.height()) + iterator_to_utf8(iter, init_null_count, first_value, "map", df.height()) } } @@ -223,7 +223,7 @@ pub fn apply_lambda_with_list_out_type<'a>( ) -> PyResult { let skip = usize::from(first_value.is_some()); if init_null_count == df.height() { - Ok(ChunkedArray::full_null("apply", df.height())) + Ok(ChunkedArray::full_null("map", df.height())) } else { let mut iters = get_iters_skip(df, init_null_count + skip); let iter = ((init_null_count + skip)..df.height()).map(|_| { @@ -243,7 +243,7 @@ pub fn apply_lambda_with_list_out_type<'a>( Err(e) => panic!("python function failed {e}"), } }); - iterator_to_list(dt, iter, init_null_count, first_value, "apply", df.height()) + iterator_to_list(dt, iter, init_null_count, first_value, "map", df.height()) } } diff --git a/py-polars/src/map/lazy.rs b/py-polars/src/map/lazy.rs index 7b58456b36d7..7138e8c00309 100644 --- a/py-polars/src/map/lazy.rs +++ b/py-polars/src/map/lazy.rs @@ -166,7 +166,7 @@ pub fn map_mul( py: Python, lambda: PyObject, output_type: Option>, - apply_groups: bool, + map_groups: bool, returns_scalar: bool, ) -> PyExpr { // get the pypolars module @@ -179,7 +179,7 @@ pub fn map_mul( let out = call_lambda_with_series_slice(py, s, &lambda, &pypolars); // we return an error, because that will become a null value polars lazy apply list - if apply_groups && out.is_none(py) { + if map_groups && out.is_none(py) { return Ok(None); } @@ -193,7 +193,7 @@ pub fn map_mul( Some(ref dt) => Field::new(fld.name(), dt.0.clone()), None => fld.clone(), }); - if apply_groups { + if map_groups { polars::lazy::dsl::apply_multiple(function, exprs, output_map, returns_scalar).into() } else { polars::lazy::dsl::map_multiple(function, exprs, output_map).into() diff --git a/py-polars/tests/unit/operations/map/test_map.py b/py-polars/tests/unit/operations/map/test_map.py index 78a10d9dd665..b5376dbdea26 100644 --- a/py-polars/tests/unit/operations/map/test_map.py +++ b/py-polars/tests/unit/operations/map/test_map.py @@ -1,402 +1,26 @@ from __future__ import annotations -import json -from datetime import date, datetime, timedelta from functools import reduce -from typing import Any, Sequence - -import numpy as np -import pytest import polars as pl -from polars.exceptions import PolarsInefficientMapWarning from polars.testing import assert_frame_equal -def test_apply_none() -> None: - df = pl.DataFrame( - { - "g": [1, 1, 1, 2, 2, 2, 5], - "a": [2, 4, 5, 190, 1, 4, 1], - "b": [1, 3, 2, 1, 43, 3, 1], - } - ) - - out = ( - df.group_by("g", maintain_order=True).agg( - pl.apply( - exprs=["a", pl.col("b") ** 4, pl.col("a") / 4], - function=lambda x: x[0] * x[1] + x[2].sum(), - ).alias("multiple") - ) - )["multiple"] - assert out[0].to_list() == [4.75, 326.75, 82.75] - assert out[1].to_list() == [238.75, 3418849.75, 372.75] - - out_df = df.select(pl.map(exprs=["a", "b"], function=lambda s: s[0] * s[1])) - assert out_df["a"].to_list() == (df["a"] * df["b"]).to_list() - - # check if we can return None - def func(s: Sequence[pl.Series]) -> pl.Series | None: - if s[0][0] == 190: - return None - else: - return s[0] - - out = ( - df.group_by("g", maintain_order=True).agg( - pl.apply( - exprs=["a", pl.col("b") ** 4, pl.col("a") / 4], function=func - ).alias("multiple") - ) - )["multiple"] - assert out[1] is None - - def test_map_return_py_object() -> None: df = pl.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - out = df.select([pl.all().map(lambda s: reduce(lambda a, b: a + b, s))]) - assert out.rows() == [(6, 15)] - - -def test_agg_objects() -> None: - df = pl.DataFrame( - { - "names": ["foo", "ham", "spam", "cheese", "egg", "foo"], - "dates": ["1", "1", "2", "3", "3", "4"], - "groups": ["A", "A", "B", "B", "B", "C"], - } - ) - - class Foo: - def __init__(self, payload: Any): - self.payload = payload - - out = df.group_by("groups").agg( - [ - pl.apply( - [pl.col("dates"), pl.col("names")], lambda s: Foo(dict(zip(s[0], s[1]))) - ) - ] - ) - assert out.dtypes == [pl.Utf8, pl.Object] - - -def test_map_elements_infer_list() -> None: - df = pl.DataFrame( - { - "int": [1, 2], - "str": ["a", "b"], - "bool": [True, None], - } - ) - assert df.select([pl.all().map_elements(lambda x: [x])]).dtypes == [pl.List] * 3 - - -def test_map_elements_arithmetic_consistency() -> None: - df = pl.DataFrame({"A": ["a", "a"], "B": [2, 3]}) - with pytest.warns( - PolarsInefficientMapWarning, match="In this case, you can replace" - ): - assert df.group_by("A").agg(pl.col("B").map_elements(lambda x: x + 1.0))[ - "B" - ].to_list() == [[3.0, 4.0]] - - -def test_map_elements_struct() -> None: - df = pl.DataFrame( - {"A": ["a", "a"], "B": [2, 3], "C": [True, False], "D": [12.0, None]} - ) - out = df.with_columns(pl.struct(df.columns).alias("struct")).select( - pl.col("struct").map_elements(lambda x: x["A"]).alias("A_field"), - pl.col("struct").map_elements(lambda x: x["B"]).alias("B_field"), - pl.col("struct").map_elements(lambda x: x["C"]).alias("C_field"), - pl.col("struct").map_elements(lambda x: x["D"]).alias("D_field"), - ) - expected = pl.DataFrame( - { - "A_field": ["a", "a"], - "B_field": [2, 3], - "C_field": [True, False], - "D_field": [12.0, None], - } - ) - - assert_frame_equal(out, expected) - - -def test_apply_numpy_out_3057() -> None: - df = pl.DataFrame( - { - "id": [0, 0, 0, 1, 1, 1], - "t": [2.0, 4.3, 5, 10, 11, 14], - "y": [0.0, 1, 1.3, 2, 3, 4], - } - ) - result = df.group_by("id", maintain_order=True).agg( - pl.apply(["y", "t"], lambda lst: np.trapz(y=lst[0], x=lst[1])).alias("result") - ) - expected = pl.DataFrame({"id": [0, 1], "result": [1.955, 13.0]}) - assert_frame_equal(result, expected) + result = df.select([pl.all().map(lambda s: reduce(lambda a, b: a + b, s))]) -def test_map_elements_numpy_int_out() -> None: - df = pl.DataFrame({"col1": [2, 4, 8, 16]}) - result = df.with_columns( - pl.col("col1").map_elements(lambda x: np.left_shift(x, 8)).alias("result") - ) - expected = pl.DataFrame({"col1": [2, 4, 8, 16], "result": [512, 1024, 2048, 4096]}) - assert_frame_equal(result, expected) - - df = pl.DataFrame({"col1": [2, 4, 8, 16], "shift": [1, 1, 2, 2]}) - result = df.select( - pl.struct(["col1", "shift"]) - .map_elements(lambda cols: np.left_shift(cols["col1"], cols["shift"])) - .alias("result") - ) - expected = pl.DataFrame({"result": [4, 8, 32, 64]}) + expected = pl.DataFrame({"A": [6], "B": [15]}) assert_frame_equal(result, expected) -def test_datelike_identity() -> None: - for s in [ - pl.Series([datetime(year=2000, month=1, day=1)]), - pl.Series([timedelta(hours=2)]), - pl.Series([date(year=2000, month=1, day=1)]), - ]: - assert s.map_elements(lambda x: x).to_list() == s.to_list() - - -def test_map_elements_list_anyvalue_fallback() -> None: - with pytest.warns( - PolarsInefficientMapWarning, - match=r'(?s)replace your `map_elements` with.*pl.col\("text"\).str.json_extract()', - ): - df = pl.DataFrame({"text": ['[{"x": 1, "y": 2}, {"x": 3, "y": 4}]']}) - assert df.select(pl.col("text").map_elements(json.loads)).to_dict(False) == { - "text": [[{"x": 1, "y": 2}, {"x": 3, "y": 4}]] - } - - # starts with empty list '[]' - df = pl.DataFrame( - { - "text": [ - "[]", - '[{"x": 1, "y": 2}, {"x": 3, "y": 4}]', - '[{"x": 1, "y": 2}]', - ] - } - ) - assert df.select(pl.col("text").map_elements(json.loads)).to_dict(False) == { - "text": [[], [{"x": 1, "y": 2}, {"x": 3, "y": 4}], [{"x": 1, "y": 2}]] - } - - -def test_map_elements_all_types() -> None: - dtypes = [ - pl.UInt8, - pl.UInt16, - pl.UInt32, - pl.UInt64, - pl.Int8, - pl.Int16, - pl.Int32, - pl.Int64, - ] - # test we don't panic - for dtype in dtypes: - pl.Series([1, 2, 3, 4, 5], dtype=dtype).map_elements(lambda x: x) - - -def test_map_elements_type_propagation() -> None: - assert ( - pl.from_dict( - { - "a": [1, 2, 3], - "b": [{"c": 1, "d": 2}, {"c": 2, "d": 3}, {"c": None, "d": None}], - } - ) - .group_by("a", maintain_order=True) - .agg( - [ - pl.when(pl.col("b").null_count() == 0) - .then( - pl.col("b").map_elements( - lambda s: s[0]["c"], - return_dtype=pl.Float64, - ) - ) - .otherwise(None) - ] - ) - ).to_dict(False) == {"a": [1, 2, 3], "b": [1.0, 2.0, None]} - - -def test_empty_list_in_map_elements() -> None: - df = pl.DataFrame( - {"a": [[1], [1, 2], [3, 4], [5, 6]], "b": [[3], [1, 2], [1, 2], [4, 5]]} - ) - - assert df.select( - pl.struct(["a", "b"]).map_elements( - lambda row: list(set(row["a"]) & set(row["b"])) - ) - ).to_dict(False) == {"a": [[], [1, 2], [], [5]]} - - -def test_map_elements_skip_nulls() -> None: - some_map = {None: "a", 1: "b"} - s = pl.Series([None, 1]) - - assert s.map_elements(lambda x: some_map[x]).to_list() == [None, "b"] - assert s.map_elements(lambda x: some_map[x], skip_nulls=False).to_list() == [ - "a", - "b", - ] - - -def test_map_elements_object_dtypes() -> None: - with pytest.warns( - PolarsInefficientMapWarning, - match=r"(?s)replace your `map_elements` with.*lambda x:", - ): - assert pl.DataFrame( - {"a": pl.Series([1, 2, "a", 4, 5], dtype=pl.Object)} - ).with_columns( - [ - pl.col("a").map_elements(lambda x: x * 2, return_dtype=pl.Object), - pl.col("a") - .map_elements( - lambda x: isinstance(x, (int, float)), return_dtype=pl.Boolean - ) - .alias("is_numeric1"), - pl.col("a") - .map_elements(lambda x: isinstance(x, (int, float))) - .alias("is_numeric_infer"), - ] - ).to_dict( - False - ) == { - "a": [2, 4, "aa", 8, 10], - "is_numeric1": [True, True, False, True, True], - "is_numeric_infer": [True, True, False, True, True], - } - - -def test_map_elements_explicit_list_output_type() -> None: - out = pl.DataFrame({"str": ["a", "b"]}).with_columns( - [ - pl.col("str").map_elements( - lambda _: pl.Series([1, 2, 3]), return_dtype=pl.List(pl.Int64) - ) - ] - ) - - assert out.dtypes == [pl.List(pl.Int64)] - assert out.to_dict(False) == {"str": [[1, 2, 3], [1, 2, 3]]} - - -def test_map_elements_dict() -> None: - with pytest.warns( - PolarsInefficientMapWarning, - match=r'(?s)replace your `map_elements` with.*pl.col\("abc"\).str.json_extract()', - ): - df = pl.DataFrame({"abc": ['{"A":"Value1"}', '{"B":"Value2"}']}) - assert df.select(pl.col("abc").map_elements(json.loads)).to_dict(False) == { - "abc": [{"A": "Value1", "B": None}, {"A": None, "B": "Value2"}] - } - assert pl.DataFrame( - {"abc": ['{"A":"Value1", "B":"Value2"}', '{"B":"Value3"}']} - ).select(pl.col("abc").map_elements(json.loads)).to_dict(False) == { - "abc": [{"A": "Value1", "B": "Value2"}, {"A": None, "B": "Value3"}] - } - - -def test_map_elements_pass_name() -> None: - df = pl.DataFrame( - { - "bar": [1, 1, 2], - "foo": [1, 2, 3], - } - ) - - mapper = {"foo": "foo1"} - - def element_mapper(s: pl.Series) -> pl.Series: - return pl.Series([mapper[s.name]]) - - assert df.group_by("bar", maintain_order=True).agg( - pl.col("foo").map_elements(element_mapper, pass_name=True), - ).to_dict(False) == {"bar": [1, 2], "foo": [["foo1"], ["foo1"]]} - - -def test_map_elements_binary() -> None: - assert pl.DataFrame({"bin": [b"\x11" * 12, b"\x22" * 12, b"\xaa" * 12]}).select( - pl.col("bin").map_elements(bytes.hex) - ).to_dict(False) == { - "bin": [ - "111111111111111111111111", - "222222222222222222222222", - "aaaaaaaaaaaaaaaaaaaaaaaa", - ] - } - - def test_map_no_dtype_set_8531() -> None: - assert ( - pl.DataFrame({"a": [1]}) - .with_columns( - pl.col("a").map(lambda x: x * 2).shift_and_fill(fill_value=0, periods=0) - ) - .item() - == 2 - ) - - -def test_map_elements_set_datetime_output_8984() -> None: - df = pl.DataFrame({"a": [""]}) - payload = datetime(2001, 1, 1) - assert df.select( - pl.col("a").map_elements(lambda _: payload, return_dtype=pl.Datetime), - )["a"].to_list() == [payload] + df = pl.DataFrame({"a": [1]}) - -def test_map_elements_dict_order_10128() -> None: - df = pl.select(pl.lit("").map_elements(lambda x: {"c": 1, "b": 2, "a": 3})) - assert df.to_dict(False) == {"literal": [{"c": 1, "b": 2, "a": 3}]} - - -def test_map_elements_10237() -> None: - df = pl.DataFrame({"a": [1, 2, 3]}) - assert ( - df.select(pl.all().map_elements(lambda x: x > 50))["a"].to_list() == [False] * 3 - ) - - -def test_map_elements_on_empty_col_10639() -> None: - df = pl.DataFrame({"A": [], "B": []}) - res = df.group_by("B").agg( - pl.col("A") - .map_elements(lambda x: x, return_dtype=pl.Int32, strategy="threading") - .alias("Foo") - ) - assert res.to_dict(False) == { - "B": [], - "Foo": [], - } - res = df.group_by("B").agg( - pl.col("A") - .map_elements(lambda x: x, return_dtype=pl.Int32, strategy="thread_local") - .alias("Foo") + result = df.with_columns( + pl.col("a").map(lambda x: x * 2).shift_and_fill(fill_value=0, periods=0) ) - assert res.to_dict(False) == { - "B": [], - "Foo": [], - } - -def test_apply_deprecated() -> None: - with pytest.deprecated_call(): - pl.col("a").apply(lambda x: x + 1) - with pytest.deprecated_call(): - pl.Series([1, 2, 3]).apply(lambda x: x + 1) + expected = pl.DataFrame({"a": [2]}) + assert_frame_equal(result, expected) diff --git a/py-polars/tests/unit/operations/map/test_map_elements.py b/py-polars/tests/unit/operations/map/test_map_elements.py new file mode 100644 index 000000000000..a60bf6ad09af --- /dev/null +++ b/py-polars/tests/unit/operations/map/test_map_elements.py @@ -0,0 +1,305 @@ +from __future__ import annotations + +import json +from datetime import date, datetime, timedelta + +import numpy as np +import pytest + +import polars as pl +from polars.exceptions import PolarsInefficientMapWarning +from polars.testing import assert_frame_equal + + +def test_map_elements_infer_list() -> None: + df = pl.DataFrame( + { + "int": [1, 2], + "str": ["a", "b"], + "bool": [True, None], + } + ) + assert df.select([pl.all().map_elements(lambda x: [x])]).dtypes == [pl.List] * 3 + + +def test_map_elements_arithmetic_consistency() -> None: + df = pl.DataFrame({"A": ["a", "a"], "B": [2, 3]}) + with pytest.warns( + PolarsInefficientMapWarning, match="In this case, you can replace" + ): + assert df.group_by("A").agg(pl.col("B").map_elements(lambda x: x + 1.0))[ + "B" + ].to_list() == [[3.0, 4.0]] + + +def test_map_elements_struct() -> None: + df = pl.DataFrame( + {"A": ["a", "a"], "B": [2, 3], "C": [True, False], "D": [12.0, None]} + ) + out = df.with_columns(pl.struct(df.columns).alias("struct")).select( + pl.col("struct").map_elements(lambda x: x["A"]).alias("A_field"), + pl.col("struct").map_elements(lambda x: x["B"]).alias("B_field"), + pl.col("struct").map_elements(lambda x: x["C"]).alias("C_field"), + pl.col("struct").map_elements(lambda x: x["D"]).alias("D_field"), + ) + expected = pl.DataFrame( + { + "A_field": ["a", "a"], + "B_field": [2, 3], + "C_field": [True, False], + "D_field": [12.0, None], + } + ) + + assert_frame_equal(out, expected) + + +def test_map_elements_numpy_int_out() -> None: + df = pl.DataFrame({"col1": [2, 4, 8, 16]}) + result = df.with_columns( + pl.col("col1").map_elements(lambda x: np.left_shift(x, 8)).alias("result") + ) + expected = pl.DataFrame({"col1": [2, 4, 8, 16], "result": [512, 1024, 2048, 4096]}) + assert_frame_equal(result, expected) + + df = pl.DataFrame({"col1": [2, 4, 8, 16], "shift": [1, 1, 2, 2]}) + result = df.select( + pl.struct(["col1", "shift"]) + .map_elements(lambda cols: np.left_shift(cols["col1"], cols["shift"])) + .alias("result") + ) + expected = pl.DataFrame({"result": [4, 8, 32, 64]}) + assert_frame_equal(result, expected) + + +def test_datelike_identity() -> None: + for s in [ + pl.Series([datetime(year=2000, month=1, day=1)]), + pl.Series([timedelta(hours=2)]), + pl.Series([date(year=2000, month=1, day=1)]), + ]: + assert s.map_elements(lambda x: x).to_list() == s.to_list() + + +def test_map_elements_list_anyvalue_fallback() -> None: + with pytest.warns( + PolarsInefficientMapWarning, + match=r'(?s)replace your `map_elements` with.*pl.col\("text"\).str.json_extract()', + ): + df = pl.DataFrame({"text": ['[{"x": 1, "y": 2}, {"x": 3, "y": 4}]']}) + assert df.select(pl.col("text").map_elements(json.loads)).to_dict(False) == { + "text": [[{"x": 1, "y": 2}, {"x": 3, "y": 4}]] + } + + # starts with empty list '[]' + df = pl.DataFrame( + { + "text": [ + "[]", + '[{"x": 1, "y": 2}, {"x": 3, "y": 4}]', + '[{"x": 1, "y": 2}]', + ] + } + ) + assert df.select(pl.col("text").map_elements(json.loads)).to_dict(False) == { + "text": [[], [{"x": 1, "y": 2}, {"x": 3, "y": 4}], [{"x": 1, "y": 2}]] + } + + +def test_map_elements_all_types() -> None: + dtypes = [ + pl.UInt8, + pl.UInt16, + pl.UInt32, + pl.UInt64, + pl.Int8, + pl.Int16, + pl.Int32, + pl.Int64, + ] + # test we don't panic + for dtype in dtypes: + pl.Series([1, 2, 3, 4, 5], dtype=dtype).map_elements(lambda x: x) + + +def test_map_elements_type_propagation() -> None: + assert ( + pl.from_dict( + { + "a": [1, 2, 3], + "b": [{"c": 1, "d": 2}, {"c": 2, "d": 3}, {"c": None, "d": None}], + } + ) + .group_by("a", maintain_order=True) + .agg( + [ + pl.when(pl.col("b").null_count() == 0) + .then( + pl.col("b").map_elements( + lambda s: s[0]["c"], + return_dtype=pl.Float64, + ) + ) + .otherwise(None) + ] + ) + ).to_dict(False) == {"a": [1, 2, 3], "b": [1.0, 2.0, None]} + + +def test_empty_list_in_map_elements() -> None: + df = pl.DataFrame( + {"a": [[1], [1, 2], [3, 4], [5, 6]], "b": [[3], [1, 2], [1, 2], [4, 5]]} + ) + + assert df.select( + pl.struct(["a", "b"]).map_elements( + lambda row: list(set(row["a"]) & set(row["b"])) + ) + ).to_dict(False) == {"a": [[], [1, 2], [], [5]]} + + +def test_map_elements_skip_nulls() -> None: + some_map = {None: "a", 1: "b"} + s = pl.Series([None, 1]) + + assert s.map_elements(lambda x: some_map[x]).to_list() == [None, "b"] + assert s.map_elements(lambda x: some_map[x], skip_nulls=False).to_list() == [ + "a", + "b", + ] + + +def test_map_elements_object_dtypes() -> None: + with pytest.warns( + PolarsInefficientMapWarning, + match=r"(?s)replace your `map_elements` with.*lambda x:", + ): + assert pl.DataFrame( + {"a": pl.Series([1, 2, "a", 4, 5], dtype=pl.Object)} + ).with_columns( + [ + pl.col("a").map_elements(lambda x: x * 2, return_dtype=pl.Object), + pl.col("a") + .map_elements( + lambda x: isinstance(x, (int, float)), return_dtype=pl.Boolean + ) + .alias("is_numeric1"), + pl.col("a") + .map_elements(lambda x: isinstance(x, (int, float))) + .alias("is_numeric_infer"), + ] + ).to_dict( + False + ) == { + "a": [2, 4, "aa", 8, 10], + "is_numeric1": [True, True, False, True, True], + "is_numeric_infer": [True, True, False, True, True], + } + + +def test_map_elements_explicit_list_output_type() -> None: + out = pl.DataFrame({"str": ["a", "b"]}).with_columns( + [ + pl.col("str").map_elements( + lambda _: pl.Series([1, 2, 3]), return_dtype=pl.List(pl.Int64) + ) + ] + ) + + assert out.dtypes == [pl.List(pl.Int64)] + assert out.to_dict(False) == {"str": [[1, 2, 3], [1, 2, 3]]} + + +def test_map_elements_dict() -> None: + with pytest.warns( + PolarsInefficientMapWarning, + match=r'(?s)replace your `map_elements` with.*pl.col\("abc"\).str.json_extract()', + ): + df = pl.DataFrame({"abc": ['{"A":"Value1"}', '{"B":"Value2"}']}) + assert df.select(pl.col("abc").map_elements(json.loads)).to_dict(False) == { + "abc": [{"A": "Value1", "B": None}, {"A": None, "B": "Value2"}] + } + assert pl.DataFrame( + {"abc": ['{"A":"Value1", "B":"Value2"}', '{"B":"Value3"}']} + ).select(pl.col("abc").map_elements(json.loads)).to_dict(False) == { + "abc": [{"A": "Value1", "B": "Value2"}, {"A": None, "B": "Value3"}] + } + + +def test_map_elements_pass_name() -> None: + df = pl.DataFrame( + { + "bar": [1, 1, 2], + "foo": [1, 2, 3], + } + ) + + mapper = {"foo": "foo1"} + + def element_mapper(s: pl.Series) -> pl.Series: + return pl.Series([mapper[s.name]]) + + assert df.group_by("bar", maintain_order=True).agg( + pl.col("foo").map_elements(element_mapper, pass_name=True), + ).to_dict(False) == {"bar": [1, 2], "foo": [["foo1"], ["foo1"]]} + + +def test_map_elements_binary() -> None: + assert pl.DataFrame({"bin": [b"\x11" * 12, b"\x22" * 12, b"\xaa" * 12]}).select( + pl.col("bin").map_elements(bytes.hex) + ).to_dict(False) == { + "bin": [ + "111111111111111111111111", + "222222222222222222222222", + "aaaaaaaaaaaaaaaaaaaaaaaa", + ] + } + + +def test_map_elements_set_datetime_output_8984() -> None: + df = pl.DataFrame({"a": [""]}) + payload = datetime(2001, 1, 1) + assert df.select( + pl.col("a").map_elements(lambda _: payload, return_dtype=pl.Datetime), + )["a"].to_list() == [payload] + + +def test_map_elements_dict_order_10128() -> None: + df = pl.select(pl.lit("").map_elements(lambda x: {"c": 1, "b": 2, "a": 3})) + assert df.to_dict(False) == {"literal": [{"c": 1, "b": 2, "a": 3}]} + + +def test_map_elements_10237() -> None: + df = pl.DataFrame({"a": [1, 2, 3]}) + assert ( + df.select(pl.all().map_elements(lambda x: x > 50))["a"].to_list() == [False] * 3 + ) + + +def test_map_elements_on_empty_col_10639() -> None: + df = pl.DataFrame({"A": [], "B": []}) + res = df.group_by("B").agg( + pl.col("A") + .map_elements(lambda x: x, return_dtype=pl.Int32, strategy="threading") + .alias("Foo") + ) + assert res.to_dict(False) == { + "B": [], + "Foo": [], + } + res = df.group_by("B").agg( + pl.col("A") + .map_elements(lambda x: x, return_dtype=pl.Int32, strategy="thread_local") + .alias("Foo") + ) + assert res.to_dict(False) == { + "B": [], + "Foo": [], + } + + +def test_apply_deprecated() -> None: + with pytest.deprecated_call(): + pl.col("a").apply(lambda x: x + 1) + with pytest.deprecated_call(): + pl.Series([1, 2, 3]).apply(lambda x: x + 1) diff --git a/py-polars/tests/unit/operations/map/test_map_groups.py b/py-polars/tests/unit/operations/map/test_map_groups.py new file mode 100644 index 000000000000..78ed33e0b012 --- /dev/null +++ b/py-polars/tests/unit/operations/map/test_map_groups.py @@ -0,0 +1,169 @@ +from __future__ import annotations + +from typing import Any, Sequence + +import numpy as np +import pytest + +import polars as pl +from polars.testing import assert_frame_equal + + +def test_map_groups() -> None: + df = pl.DataFrame( + { + "a": ["a", "b", "a", "b", "b", "c"], + "b": [1, 2, 3, 4, 5, 6], + "c": [6, 5, 4, 3, 2, 1], + } + ) + + result = df.group_by("a").map_groups(lambda df: df[["c"]].sum()) + + expected = pl.DataFrame({"c": [10, 10, 1]}) + assert_frame_equal(result, expected, check_row_order=False) + + +def test_map_groups_lazy() -> None: + lf = pl.LazyFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 3.0]}) + + schema = {"a": pl.Float64, "b": pl.Float64} + result = lf.group_by("a").map_groups(lambda df: df * 2.0, schema=schema) + + expected = pl.LazyFrame({"a": [6.0, 2.0, 2.0], "b": [6.0, 2.0, 4.0]}) + assert_frame_equal(result, expected, check_row_order=False) + assert result.schema == expected.schema + + +def test_map_groups_rolling() -> None: + df = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "b": [1, 2, 3, 4, 5], + } + ).set_sorted("a") + + def function(df: pl.DataFrame) -> pl.DataFrame: + return df.select( + pl.col("a").min(), + pl.col("b").max(), + ) + + result = df.group_by_rolling("a", period="2i").map_groups( + function, schema=df.schema + ) + + expected = pl.DataFrame( + [ + pl.Series("a", [1, 1, 2, 3, 4], dtype=pl.Int64), + pl.Series("b", [1, 2, 3, 4, 5], dtype=pl.Int64), + ] + ) + assert_frame_equal(result, expected) + + +def test_map_groups_empty() -> None: + df = pl.DataFrame(schema={"x": pl.Int64}) + with pytest.raises( + pl.ComputeError, match=r"cannot group_by \+ apply on empty 'DataFrame'" + ): + df.group_by("x").map_groups(lambda x: x) + + +def test_map_groups_none() -> None: + df = pl.DataFrame( + { + "g": [1, 1, 1, 2, 2, 2, 5], + "a": [2, 4, 5, 190, 1, 4, 1], + "b": [1, 3, 2, 1, 43, 3, 1], + } + ) + + out = ( + df.group_by("g", maintain_order=True).agg( + pl.map_groups( + exprs=["a", pl.col("b") ** 4, pl.col("a") / 4], + function=lambda x: x[0] * x[1] + x[2].sum(), + ).alias("multiple") + ) + )["multiple"] + assert out[0].to_list() == [4.75, 326.75, 82.75] + assert out[1].to_list() == [238.75, 3418849.75, 372.75] + + out_df = df.select(pl.map(exprs=["a", "b"], function=lambda s: s[0] * s[1])) + assert out_df["a"].to_list() == (df["a"] * df["b"]).to_list() + + # check if we can return None + def func(s: Sequence[pl.Series]) -> pl.Series | None: + if s[0][0] == 190: + return None + else: + return s[0] + + out = ( + df.group_by("g", maintain_order=True).agg( + pl.map_groups( + exprs=["a", pl.col("b") ** 4, pl.col("a") / 4], function=func + ).alias("multiple") + ) + )["multiple"] + assert out[1] is None + + +def test_map_groups_object_output() -> None: + df = pl.DataFrame( + { + "names": ["foo", "ham", "spam", "cheese", "egg", "foo"], + "dates": ["1", "1", "2", "3", "3", "4"], + "groups": ["A", "A", "B", "B", "B", "C"], + } + ) + + class Foo: + def __init__(self, payload: Any): + self.payload = payload + + result = df.group_by("groups").agg( + pl.map_groups( + [pl.col("dates"), pl.col("names")], lambda s: Foo(dict(zip(s[0], s[1]))) + ) + ) + + assert result.dtypes == [pl.Utf8, pl.Object] + + +def test_map_groups_numpy_output_3057() -> None: + df = pl.DataFrame( + { + "id": [0, 0, 0, 1, 1, 1], + "t": [2.0, 4.3, 5, 10, 11, 14], + "y": [0.0, 1, 1.3, 2, 3, 4], + } + ) + + result = df.group_by("id", maintain_order=True).agg( + pl.map_groups(["y", "t"], lambda lst: np.trapz(y=lst[0], x=lst[1])).alias( + "result" + ) + ) + + expected = pl.DataFrame({"id": [0, 1], "result": [1.955, 13.0]}) + assert_frame_equal(result, expected) + + +def test_apply_deprecated() -> None: + df = pl.DataFrame( + { + "a": [1, 1, 2, 2, 3], + "b": [1, 2, 3, 4, 5], + } + ).set_sorted("a") + + with pytest.deprecated_call(): + df.group_by("a").apply(lambda x: x) + with pytest.deprecated_call(): + df.group_by_rolling("a", period="2i").apply(lambda x: x, schema=None) + with pytest.deprecated_call(): + df.group_by_dynamic("a", every="2i").apply(lambda x: x, schema=None) + with pytest.deprecated_call(): + pl.apply(["a", "b"], lambda x: x) diff --git a/py-polars/tests/unit/operations/map/test_map_rows.py b/py-polars/tests/unit/operations/map/test_map_rows.py index 998c83dbf67c..bb53dfa64262 100644 --- a/py-polars/tests/unit/operations/map/test_map_rows.py +++ b/py-polars/tests/unit/operations/map/test_map_rows.py @@ -13,7 +13,7 @@ def test_map_rows() -> None: result = df.map_rows(lambda x: len(x), None) - expected = pl.DataFrame({"apply": [3, 3]}) + expected = pl.DataFrame({"map": [3, 3]}) assert_frame_equal(result, expected) @@ -22,7 +22,7 @@ def test_map_rows_list_return() -> None: result = df.map_rows(lambda r: pl.Series(range(r[0], r[1] + 1))) - expected = pl.DataFrame({"apply": [[1, 2, 3], [2, 3, 4, 5]]}) + expected = pl.DataFrame({"map": [[1, 2, 3], [2, 3, 4, 5]]}) assert_frame_equal(result, expected) @@ -74,5 +74,5 @@ def test_apply_deprecated() -> None: with pytest.deprecated_call(): result = df.apply(lambda x: len(x), None) - expected = pl.DataFrame({"apply": [3, 3]}) + expected = pl.DataFrame({"map": [3, 3]}) assert_frame_equal(result, expected) diff --git a/py-polars/tests/unit/operations/test_group_by.py b/py-polars/tests/unit/operations/test_group_by.py index 2be46e4dbde3..99a14b35e2a1 100644 --- a/py-polars/tests/unit/operations/test_group_by.py +++ b/py-polars/tests/unit/operations/test_group_by.py @@ -26,8 +26,6 @@ def test_group_by() -> None: } ) - assert df.group_by("a").apply(lambda df: df[["c"]].sum()).sort("c")["c"][0] == 1 - # Use lazy API in eager group_by assert sorted(df.group_by("a").agg([pl.sum("b")]).rows()) == [ ("a", 4), diff --git a/py-polars/tests/unit/operations/test_group_by_rolling.py b/py-polars/tests/unit/operations/test_group_by_rolling.py index c976e147cf72..fb170e4f86bf 100644 --- a/py-polars/tests/unit/operations/test_group_by_rolling.py +++ b/py-polars/tests/unit/operations/test_group_by_rolling.py @@ -24,31 +24,6 @@ def good_agg_parameters() -> list[pl.Expr | list[pl.Expr]]: ] -def test_group_by_rolling_map() -> None: - df = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": [1, 2, 3, 4, 5], - } - ).set_sorted("a") - - def apply(df: pl.DataFrame) -> pl.DataFrame: - return df.select( - pl.col("a").min(), - pl.col("b").max(), - ) - - expected = pl.DataFrame( - [ - pl.Series("a", [1, 1, 2, 3, 4], dtype=pl.Int64), - pl.Series("b", [1, 2, 3, 4, 5], dtype=pl.Int64), - ] - ) - - out = df.group_by_rolling("a", period="2i").apply(apply, schema=df.schema) - assert_frame_equal(out, expected) - - def test_rolling_group_by_overlapping_groups() -> None: # this first aggregates overlapping groups so they cannot be naively flattened df = pl.DataFrame({"a": [41, 60, 37, 51, 52, 39, 40]}) diff --git a/py-polars/tests/unit/test_empty.py b/py-polars/tests/unit/test_empty.py index 6f3f47addbad..6af7ebc4483e 100644 --- a/py-polars/tests/unit/test_empty.py +++ b/py-polars/tests/unit/test_empty.py @@ -72,14 +72,6 @@ def test_empty_9137() -> None: assert out.dtypes == [pl.Float32, pl.Float32] -def test_empty_group_by_apply_err() -> None: - df = pl.DataFrame(schema={"x": pl.Int64}) - with pytest.raises( - pl.ComputeError, match=r"cannot group_by \+ apply on empty 'DataFrame'" - ): - df.group_by("x").apply(lambda x: x) - - def test_empty_list_namespace_output_9585() -> None: dtype = pl.List(pl.Utf8) names = ["sort", "unique", "head", "tail", "shift", "reverse"] diff --git a/py-polars/tests/unit/test_lazy.py b/py-polars/tests/unit/test_lazy.py index bd8a276698a8..2fd9e26c5849 100644 --- a/py-polars/tests/unit/test_lazy.py +++ b/py-polars/tests/unit/test_lazy.py @@ -166,17 +166,6 @@ def test_or() -> None: assert out.rows() == [(1, 1.0), (3, 3.0)] -def test_group_by_apply() -> None: - ldf = ( - pl.LazyFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 3.0]}) - .group_by("a") - .apply(lambda df: df * 2.0, schema={"a": pl.Float64, "b": pl.Float64}) - ) - out = ldf.collect() - assert out.schema == ldf.schema - assert out.shape == (3, 2) - - def test_filter_str() -> None: # use a str instead of a column expr ldf = pl.LazyFrame(