DEPR: dropping nuisance columns in DataFrameGroupby apply, agg, transform (#41475)
jbrockmendel authored May 26, 2021
1 parent 2f7be5e commit f373bba
Showing 10 changed files with 172 additions and 23 deletions.
2 changes: 2 additions & 0 deletions doc/source/user_guide/groupby.rst
@@ -1000,6 +1000,7 @@ instance method on each data group. This is pretty easy to do by passing lambda
functions:

.. ipython:: python
    :okwarning:

    grouped = df.groupby("A")
    grouped.agg(lambda x: x.std())
@@ -1009,6 +1010,7 @@ arguments. Using a bit of metaprogramming cleverness, GroupBy now has the
ability to "dispatch" method calls to the groups:

.. ipython:: python
    :okwarning:

    grouped.std()
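Both ``:okwarning:`` flags are needed because the user guide's example frame mixes string and numeric columns, so ``grouped.agg(lambda x: x.std())`` and ``grouped.std()`` now emit the new deprecation warning when the docs build. A minimal sketch of the warning-free spelling readers can use instead (assumes the guide's usual frame with string columns "A"/"B" and float columns "C"/"D"; the column names here are illustrative):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar"],
            "B": ["one", "two", "one", "two"],  # nuisance column for std()
            "C": np.random.randn(4),
            "D": np.random.randn(4),
        }
    )

    # Selecting the numeric columns up front avoids relying on the
    # deprecated nuisance-column dropping.
    grouped = df.groupby("A")[["C", "D"]]
    grouped.std()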
38 changes: 38 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
@@ -726,6 +726,44 @@ For example:
    A    24
    dtype: int64

Similarly, when applying a function to :class:`DataFrameGroupBy`, columns on which
the function raises ``TypeError`` are currently silently ignored and dropped
from the result.

This behavior is deprecated. In a future version, the ``TypeError``
will be raised, and users will need to select only valid columns before calling
the function.

For example:

.. ipython:: python

    df = pd.DataFrame({"A": [1, 2, 3, 4], "B": pd.date_range("2016-01-01", periods=4)})
    gb = df.groupby([1, 1, 2, 2])

*Old behavior*:

.. code-block:: ipython

    In [4]: gb.prod(numeric_only=False)
    Out[4]:
       A
    1   2
    2  12

*Future behavior*:

.. code-block:: ipython

    In [5]: gb.prod(numeric_only=False)
    ...
    TypeError: datetime64 type does not support prod operations

    In [6]: gb[["A"]].prod(numeric_only=False)
    Out[6]:
       A
    1   2
    2  12
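For users who hit the new warning, a minimal sketch of the suggested migration, reusing the frame from the example above (assuming pandas 1.3; ``select_dtypes`` is just one way to do the column selection):

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2, 3, 4], "B": pd.date_range("2016-01-01", periods=4)})
    gb = df.groupby([1, 1, 2, 2])

    # Subset explicitly to the columns the reduction supports ...
    result = gb[["A"]].prod(numeric_only=False)

    # ... or select by dtype before grouping.
    numeric = df.select_dtypes(include="number")
    result = numeric.groupby([1, 1, 2, 2]).prod()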
.. ---------------------------------------------------------------------------
28 changes: 27 additions & 1 deletion pandas/core/groupby/generic.py
@@ -1087,6 +1087,15 @@ def array_func(values: ArrayLike) -> ArrayLike:
        if not len(new_mgr) and len(orig):
            # If the original Manager was already empty, no need to raise
            raise DataError("No numeric types to aggregate")
        if len(new_mgr) < len(data):
            warnings.warn(
                f"Dropping invalid columns in {type(self).__name__}.{how} "
                "is deprecated. In a future version, a TypeError will be raised. "
                f"Before calling .{how}, select only columns which should be "
                "valid for the function.",
                FutureWarning,
                stacklevel=4,
            )

        return self._wrap_agged_manager(new_mgr)
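This branch fires when ``grouped_reduce(..., ignore_failures=True)`` dropped one or more columns, i.e. the aggregated manager ends up shorter than the input. A rough user-level sketch of what that looks like (assuming pandas 1.3; ``prod`` on the datetime column is expected to fail and be dropped, though the exact operation that routes through this path can vary):

    import warnings

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2, 3, 4], "B": pd.date_range("2016-01-01", periods=4)})
    gb = df.groupby([1, 1, 2, 2])

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        result = gb.prod(numeric_only=False)  # "B" cannot be multiplied, so it is dropped

    # The recorded warning names the groupby class and the failing method.
    assert any(
        issubclass(w.category, FutureWarning) and "Dropping invalid columns" in str(w.message)
        for w in caught
    )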

@@ -1283,6 +1292,16 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike:
        res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True)
        res_mgr.set_axis(1, mgr.axes[1])

        if len(res_mgr) < len(mgr):
            warnings.warn(
                f"Dropping invalid columns in {type(self).__name__}.{how} "
                "is deprecated. In a future version, a TypeError will be raised. "
                f"Before calling .{how}, select only columns which should be "
                "valid for the transforming function.",
                FutureWarning,
                stacklevel=4,
            )

        res_df = self.obj._constructor(res_mgr)
        if self.axis == 1:
            res_df = res_df.T
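Same idea for the cython transform path: any column whose values cannot go through the transform kernel is missing from ``res_mgr``, and the drop is now announced. A hedged sketch (assuming pandas 1.3; the cumulative product is expected to fail on the string column and drop it):

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2, 3, 4], "B": list("wxyz")})
    gb = df.groupby([1, 1, 2, 2])

    # "B" cannot be cumulatively multiplied; previously it vanished silently,
    # now the drop comes with a FutureWarning.
    result = gb.cumprod()
    print(result.columns.tolist())  # expected: ["A"]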
@@ -1420,7 +1439,14 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
                output[i] = sgb.transform(wrapper)
            except TypeError:
                # e.g. trying to call nanmean with string values
                pass
                warnings.warn(
                    f"Dropping invalid columns in {type(self).__name__}.transform "
                    "is deprecated. In a future version, a TypeError will be raised. "
                    "Before calling .transform, select only columns which should be "
                    "valid for the transforming function.",
                    FutureWarning,
                    stacklevel=5,
                )
            else:
                inds.append(i)
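The item-by-item fallback handles user-supplied transform functions column by column, so the ``pass`` that used to swallow the ``TypeError`` becomes a warning. A sketch of the user-facing effect (assuming pandas 1.3; whether a given call actually reaches this fallback rather than one of the other warning sites depends on the transform path taken):

    import pandas as pd

    df = pd.DataFrame({"A": [1.0, 2.0, 3.0, 4.0], "B": list("wxyz")})
    gb = df.groupby([1, 1, 2, 2])

    # Demeaning works for "A" but raises TypeError for the string column "B";
    # the result keeps only "A", and the dropped column is now reported.
    result = gb.transform(lambda x: x - x.mean())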

19 changes: 19 additions & 0 deletions pandas/core/groupby/groupby.py
@@ -30,6 +30,7 @@ class providing the base-class of operations.
    Union,
    cast,
)
import warnings

import numpy as np

@@ -1280,6 +1281,14 @@ def _python_agg_general(self, func, *args, **kwargs):
                # if this function is invalid for this dtype, we will ignore it.
                result = self.grouper.agg_series(obj, f)
            except TypeError:
                warnings.warn(
                    f"Dropping invalid columns in {type(self).__name__}.agg "
                    "is deprecated. In a future version, a TypeError will be raised. "
                    "Before calling .agg, select only columns which should be "
                    "valid for the aggregating function.",
                    FutureWarning,
                    stacklevel=3,
                )
                continue

            key = base.OutputKey(label=name, position=idx)
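``_python_agg_general`` walks the columns one ``Series`` at a time, so a ``TypeError`` from the aggregating function used to simply skip that column. A sketch of the user-facing effect (assuming pandas 1.3; which internal aggregation path a given ``agg`` call takes can vary):

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2, 3, 4], "C": ["a", "b", "c", "d"]})
    grouped = df.groupby([0, 0, 1, 1])


    def numeric_only_sum(ser):
        # Mimics an aggregation that only makes sense for numeric data.
        if ser.dtype == object:
            raise TypeError("can only aggregate numeric data")
        return ser.sum()


    # "C" is still skipped, but now with a FutureWarning instead of silently.
    result = grouped.agg(numeric_only_sum)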
@@ -2839,6 +2848,16 @@ def _get_cythonized_result(
                    vals, inferences = pre_processing(vals)
                except TypeError as err:
                    error_msg = str(err)
                    howstr = how.replace("group_", "")
                    warnings.warn(
                        "Dropping invalid columns in "
                        f"{type(self).__name__}.{howstr} is deprecated. "
                        "In a future version, a TypeError will be raised. "
                        f"Before calling .{howstr}, select only columns which "
                        "should be valid for the function.",
                        FutureWarning,
                        stacklevel=3,
                    )
                    continue
            vals = vals.astype(cython_dtype, copy=False)
            if needs_2d:
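``_get_cythonized_result`` backs operations such as ``quantile`` whose ``pre_processing`` step rejects unsupported dtypes, so the warning now accompanies the ``continue``. A user-level sketch mirroring the ``test_groupby_quantile_skips_invalid_dtype`` change further down (assuming pandas 1.3):

    import pandas as pd

    df = pd.DataFrame({"a": [1], "b": [2.0], "c": ["x"]})

    # quantile() cannot be computed for the object column "c"; it is dropped
    # from the result and, after this change, a FutureWarning is emitted.
    result = df.groupby("a").quantile(0.5)

    # Warning-free spelling: select the valid columns first.
    expected = df.groupby("a")[["b"]].quantile(0.5)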
4 changes: 3 additions & 1 deletion pandas/tests/groupby/aggregate/test_aggregate.py
@@ -257,7 +257,8 @@ def func(ser):
        else:
            return ser.sum()

    result = grouped.aggregate(func)
    with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"):
        result = grouped.aggregate(func)
    exp_grouped = three_group.loc[:, three_group.columns != "C"]
    expected = exp_grouped.groupby(["A", "B"]).aggregate(func)
    tm.assert_frame_equal(result, expected)
@@ -1020,6 +1021,7 @@ def test_mangle_series_groupby(self):
        tm.assert_frame_equal(result, expected)

    @pytest.mark.xfail(reason="GH-26611. kwargs for multi-agg.")
    @pytest.mark.filterwarnings("ignore:Dropping invalid columns:FutureWarning")
    def test_with_kwargs(self):
        f1 = lambda x, y, b=1: x.sum() + y + b
        f2 = lambda x, y, b=2: x.sum() + y * b
14 changes: 11 additions & 3 deletions pandas/tests/groupby/aggregate/test_other.py
@@ -44,9 +44,16 @@ def test_agg_api():
    def peak_to_peak(arr):
        return arr.max() - arr.min()

    expected = grouped.agg([peak_to_peak])
    with tm.assert_produces_warning(
        FutureWarning, match="Dropping invalid", check_stacklevel=False
    ):
        expected = grouped.agg([peak_to_peak])
    expected.columns = ["data1", "data2"]
    result = grouped.agg(peak_to_peak)

    with tm.assert_produces_warning(
        FutureWarning, match="Dropping invalid", check_stacklevel=False
    ):
        result = grouped.agg(peak_to_peak)
    tm.assert_frame_equal(result, expected)


@@ -294,7 +301,8 @@ def raiseException(df):
        raise TypeError("test")

    with pytest.raises(TypeError, match="test"):
        df.groupby(0).agg(raiseException)
        with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
            df.groupby(0).agg(raiseException)


def test_series_agg_multikey():
32 changes: 27 additions & 5 deletions pandas/tests/groupby/test_function.py
@@ -87,13 +87,15 @@ def test_max_min_object_multiple_columns(using_array_manager):

    gb = df.groupby("A")

    result = gb.max(numeric_only=False)
    with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
        result = gb.max(numeric_only=False)
    # "max" is valid for column "C" but not for "B"
    ei = Index([1, 2, 3], name="A")
    expected = DataFrame({"C": ["b", "d", "e"]}, index=ei)
    tm.assert_frame_equal(result, expected)

    result = gb.min(numeric_only=False)
    with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
        result = gb.min(numeric_only=False)
    # "min" is valid for column "C" but not for "B"
    ei = Index([1, 2, 3], name="A")
    expected = DataFrame({"C": ["a", "c", "e"]}, index=ei)
@@ -221,7 +223,10 @@ def test_averages(self, df, method):
            ],
        )

        result = getattr(gb, method)(numeric_only=False)
        with tm.assert_produces_warning(
            FutureWarning, match="Dropping invalid", check_stacklevel=False
        ):
            result = getattr(gb, method)(numeric_only=False)
        tm.assert_frame_equal(result.reindex_like(expected), expected)

        expected_columns = expected.columns
@@ -303,10 +308,27 @@ def test_cummin_cummax(self, df, method):
    def _check(self, df, method, expected_columns, expected_columns_numeric):
        gb = df.groupby("group")

        result = getattr(gb, method)()
        # cummin, cummax dont have numeric_only kwarg, always use False
        warn = None
        if method in ["cummin", "cummax"]:
            # these dont have numeric_only kwarg, always use False
            warn = FutureWarning
        elif method in ["min", "max"]:
            # these have numeric_only kwarg, but default to False
            warn = FutureWarning

        with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
            result = getattr(gb, method)()

        tm.assert_index_equal(result.columns, expected_columns_numeric)

        result = getattr(gb, method)(numeric_only=False)
        # GH#41475 deprecated silently ignoring nuisance columns
        warn = None
        if len(expected_columns) < len(gb._obj_with_exclusions.columns):
            warn = FutureWarning
        with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
            result = getattr(gb, method)(numeric_only=False)

        tm.assert_index_equal(result.columns, expected_columns)


4 changes: 3 additions & 1 deletion pandas/tests/groupby/test_groupby.py
@@ -923,7 +923,8 @@ def aggfun(ser):
        else:
            return ser.sum()

    agged2 = df.groupby(keys).aggregate(aggfun)
    with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"):
        agged2 = df.groupby(keys).aggregate(aggfun)
    assert len(agged2.columns) + 1 == len(df.columns)


@@ -1757,6 +1758,7 @@ def test_pivot_table_values_key_error():
@pytest.mark.parametrize(
    "op", ["idxmax", "idxmin", "mad", "min", "max", "sum", "prod", "skew"]
)
@pytest.mark.filterwarnings("ignore:Dropping invalid columns:FutureWarning")
@pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
def test_empty_groupby(columns, keys, values, method, op, request):
    # GH8093 & GH26411
11 changes: 9 additions & 2 deletions pandas/tests/groupby/test_quantile.py
@@ -155,7 +155,10 @@ def test_quantile_raises():
    df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])

    with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"):
        df.groupby("key").quantile()
        with tm.assert_produces_warning(
            FutureWarning, match="Dropping invalid columns"
        ):
            df.groupby("key").quantile()


def test_quantile_out_of_bounds_q_raises():
@@ -236,7 +239,11 @@ def test_groupby_quantile_nullable_array(values, q):
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
def test_groupby_quantile_skips_invalid_dtype(q):
    df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]})
    result = df.groupby("a").quantile(q)

    warn = None if isinstance(q, list) else FutureWarning
    with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
        result = df.groupby("a").quantile(q)

    expected = df.groupby("a")[["b"]].quantile(q)
    tm.assert_frame_equal(result, expected)
