Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: dropping nuisance columns in DataFrameGroupBy apply, agg, transform #41475

Merged
merged 9 commits into from
May 26, 2021
2 changes: 2 additions & 0 deletions doc/source/user_guide/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1000,6 +1000,7 @@ instance method on each data group. This is pretty easy to do by passing lambda
functions:

.. ipython:: python
:okwarning:

grouped = df.groupby("A")
grouped.agg(lambda x: x.std())
Expand All @@ -1009,6 +1010,7 @@ arguments. Using a bit of metaprogramming cleverness, GroupBy now has the
ability to "dispatch" method calls to the groups:

.. ipython:: python
:okwarning:

grouped.std()

Expand Down
38 changes: 38 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -720,6 +720,44 @@ For example:
A 24
dtype: int64


Similarly, when applying a function to :class:`DataFrameGroupBy`, columns on which
the function raises ``TypeError`` are currently silently ignored and dropped
from the result.

This behavior is deprecated. In a future version, the ``TypeError``
will be raised, and users will need to select only valid columns before calling
the function.

For example:

.. ipython:: python

df = pd.DataFrame({"A": [1, 2, 3, 4], "B": pd.date_range("2016-01-01", periods=4)})
gb = df.groupby([1, 1, 2, 2])

*Old behavior*:

.. code-block:: ipython

In [4]: gb.prod(numeric_only=False)
Out[4]:
A
1 2
2 12

*Future behavior*:

.. code-block:: ipython

In [5]: gb.prod(numeric_only=False)
...
TypeError: datetime64 type does not support prod operations

In [6]: gb[["A"]].prod(numeric_only=False)
Out[6]:
A
1 2
2 12

.. ---------------------------------------------------------------------------


Expand Down
28 changes: 27 additions & 1 deletion pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1087,6 +1087,15 @@ def array_func(values: ArrayLike) -> ArrayLike:
if not len(new_mgr) and len(orig):
# If the original Manager was already empty, no need to raise
raise DataError("No numeric types to aggregate")
if len(new_mgr) < len(data):
warnings.warn(
f"Dropping invalid columns in {type(self).__name__}.{how} "
"is deprecated. In a future version, a TypeError will be raised. "
f"Before calling .{how}, select only columns which should be "
"valid for the function.",
FutureWarning,
stacklevel=4,
)

return self._wrap_agged_manager(new_mgr)

Expand Down Expand Up @@ -1283,6 +1292,16 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike:
res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True)
res_mgr.set_axis(1, mgr.axes[1])

if len(res_mgr) < len(mgr):
warnings.warn(
f"Dropping invalid columns in {type(self).__name__}.{how} "
"is deprecated. In a future version, a TypeError will be raised. "
f"Before calling .{how}, select only columns which should be "
"valid for the transforming function.",
FutureWarning,
stacklevel=4,
)

res_df = self.obj._constructor(res_mgr)
if self.axis == 1:
res_df = res_df.T
Expand Down Expand Up @@ -1420,7 +1439,14 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
output[i] = sgb.transform(wrapper)
except TypeError:
# e.g. trying to call nanmean with string values
pass
warnings.warn(
f"Dropping invalid columns in {type(self).__name__}.transform "
"is deprecated. In a future version, a TypeError will be raised. "
"Before calling .transform, select only columns which should be "
"valid for the transforming function.",
FutureWarning,
stacklevel=5,
)
else:
inds.append(i)

Expand Down
19 changes: 19 additions & 0 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class providing the base-class of operations.
Union,
cast,
)
import warnings

import numpy as np

Expand Down Expand Up @@ -1270,6 +1271,14 @@ def _python_agg_general(self, func, *args, **kwargs):
# if this function is invalid for this dtype, we will ignore it.
result = self.grouper.agg_series(obj, f)
except TypeError:
warnings.warn(
f"Dropping invalid columns in {type(self).__name__}.agg "
"is deprecated. In a future version, a TypeError will be raised. "
"Before calling .agg, select only columns which should be "
"valid for the aggregating function.",
FutureWarning,
stacklevel=3,
)
continue

key = base.OutputKey(label=name, position=idx)
Expand Down Expand Up @@ -2829,6 +2838,16 @@ def _get_cythonized_result(
vals, inferences = pre_processing(vals)
except TypeError as err:
error_msg = str(err)
howstr = how.replace("group_", "")
warnings.warn(
"Dropping invalid columns in "
f"{type(self).__name__}.{howstr} is deprecated. "
"In a future version, a TypeError will be raised. "
f"Before calling .{howstr}, select only columns which "
"should be valid for the function.",
FutureWarning,
stacklevel=3,
)
continue
vals = vals.astype(cython_dtype, copy=False)
if needs_2d:
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,8 @@ def func(ser):
else:
return ser.sum()

result = grouped.aggregate(func)
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"):
result = grouped.aggregate(func)
exp_grouped = three_group.loc[:, three_group.columns != "C"]
expected = exp_grouped.groupby(["A", "B"]).aggregate(func)
tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -1020,6 +1021,7 @@ def test_mangle_series_groupby(self):
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(reason="GH-26611. kwargs for multi-agg.")
@pytest.mark.filterwarnings("ignore:Dropping invalid columns:FutureWarning")
def test_with_kwargs(self):
f1 = lambda x, y, b=1: x.sum() + y + b
f2 = lambda x, y, b=2: x.sum() + y * b
Expand Down
14 changes: 11 additions & 3 deletions pandas/tests/groupby/aggregate/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,16 @@ def test_agg_api():
def peak_to_peak(arr):
return arr.max() - arr.min()

expected = grouped.agg([peak_to_peak])
with tm.assert_produces_warning(
FutureWarning, match="Dropping invalid", check_stacklevel=False
):
expected = grouped.agg([peak_to_peak])
expected.columns = ["data1", "data2"]
result = grouped.agg(peak_to_peak)

with tm.assert_produces_warning(
FutureWarning, match="Dropping invalid", check_stacklevel=False
):
result = grouped.agg(peak_to_peak)
tm.assert_frame_equal(result, expected)


Expand Down Expand Up @@ -294,7 +301,8 @@ def raiseException(df):
raise TypeError("test")

with pytest.raises(TypeError, match="test"):
df.groupby(0).agg(raiseException)
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
df.groupby(0).agg(raiseException)


def test_series_agg_multikey():
Expand Down
32 changes: 27 additions & 5 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,15 @@ def test_max_min_object_multiple_columns(using_array_manager):

gb = df.groupby("A")

result = gb.max(numeric_only=False)
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
result = gb.max(numeric_only=False)
# "max" is valid for column "C" but not for "B"
ei = Index([1, 2, 3], name="A")
expected = DataFrame({"C": ["b", "d", "e"]}, index=ei)
tm.assert_frame_equal(result, expected)

result = gb.min(numeric_only=False)
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
result = gb.min(numeric_only=False)
# "min" is valid for column "C" but not for "B"
ei = Index([1, 2, 3], name="A")
expected = DataFrame({"C": ["a", "c", "e"]}, index=ei)
Expand Down Expand Up @@ -221,7 +223,10 @@ def test_averages(self, df, method):
],
)

result = getattr(gb, method)(numeric_only=False)
with tm.assert_produces_warning(
FutureWarning, match="Dropping invalid", check_stacklevel=False
):
result = getattr(gb, method)(numeric_only=False)
tm.assert_frame_equal(result.reindex_like(expected), expected)

expected_columns = expected.columns
Expand Down Expand Up @@ -303,10 +308,27 @@ def test_cummin_cummax(self, df, method):
def _check(self, df, method, expected_columns, expected_columns_numeric):
gb = df.groupby("group")

result = getattr(gb, method)()
# cummin, cummax dont have numeric_only kwarg, always use False
warn = None
if method in ["cummin", "cummax"]:
# these dont have numeric_only kwarg, always use False
warn = FutureWarning
elif method in ["min", "max"]:
# these have numeric_only kwarg, but default to False
warn = FutureWarning

with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
result = getattr(gb, method)()

tm.assert_index_equal(result.columns, expected_columns_numeric)

result = getattr(gb, method)(numeric_only=False)
# GH#41475 deprecated silently ignoring nuisance columns
warn = None
if len(expected_columns) < len(gb._obj_with_exclusions.columns):
warn = FutureWarning
with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
result = getattr(gb, method)(numeric_only=False)

tm.assert_index_equal(result.columns, expected_columns)


Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -923,7 +923,8 @@ def aggfun(ser):
else:
return ser.sum()

agged2 = df.groupby(keys).aggregate(aggfun)
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"):
agged2 = df.groupby(keys).aggregate(aggfun)
assert len(agged2.columns) + 1 == len(df.columns)


Expand Down Expand Up @@ -1757,6 +1758,7 @@ def test_pivot_table_values_key_error():
@pytest.mark.parametrize(
"op", ["idxmax", "idxmin", "mad", "min", "max", "sum", "prod", "skew"]
)
@pytest.mark.filterwarnings("ignore:Dropping invalid columns:FutureWarning")
@pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
def test_empty_groupby(columns, keys, values, method, op, request):
# GH8093 & GH26411
Expand Down
11 changes: 9 additions & 2 deletions pandas/tests/groupby/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,10 @@ def test_quantile_raises():
df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])

with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"):
df.groupby("key").quantile()
with tm.assert_produces_warning(
FutureWarning, match="Dropping invalid columns"
):
df.groupby("key").quantile()


def test_quantile_out_of_bounds_q_raises():
Expand Down Expand Up @@ -236,7 +239,11 @@ def test_groupby_quantile_nullable_array(values, q):
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
def test_groupby_quantile_skips_invalid_dtype(q):
df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]})
result = df.groupby("a").quantile(q)

warn = None if isinstance(q, list) else FutureWarning
with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
result = df.groupby("a").quantile(q)

expected = df.groupby("a")[["b"]].quantile(q)
tm.assert_frame_equal(result, expected)

Expand Down
Loading