Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: dropping nuisance columns in DataFrameGroupby apply, agg, transform #41475

Merged
merged 9 commits into from
May 26, 2021
2 changes: 2 additions & 0 deletions doc/source/user_guide/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1000,6 +1000,7 @@ instance method on each data group. This is pretty easy to do by passing lambda
functions:

.. ipython:: python
:okwarning:

grouped = df.groupby("A")
grouped.agg(lambda x: x.std())
Expand All @@ -1009,6 +1010,7 @@ arguments. Using a bit of metaprogramming cleverness, GroupBy now has the
ability to "dispatch" method calls to the groups:

.. ipython:: python
:okwarning:

grouped.std()

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -647,6 +647,7 @@ Deprecations
- Deprecated setting :attr:`Categorical._codes`, create a new :class:`Categorical` with the desired codes instead (:issue:`40606`)
- Deprecated behavior of :meth:`DatetimeIndex.union` with mixed timezones; in a future version both will be cast to UTC instead of object dtype (:issue:`39328`)
- Deprecated using ``usecols`` with out of bounds indices for ``read_csv`` with ``engine="c"`` (:issue:`25623`)
- Deprecated ignoring "nuisance columns" in :meth:`DataFrameGroupBy.agg`, :meth:`DataFrameGroupBy.apply`, and :meth:`DataFrameGroupBy.transform`; select valid columns before calling the method (:issue:`41475`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you make this a sub-section (ok to lump this in with the DataFrame reductions PR sub-section as well), e.g. just a hey we are now deprecating both of these simliar things, with an example.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

e.g. #41480

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated + green


.. ---------------------------------------------------------------------------

Expand Down
9 changes: 8 additions & 1 deletion pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1428,7 +1428,14 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
output[i] = sgb.transform(wrapper)
except TypeError:
# e.g. trying to call nanmean with string values
pass
warnings.warn(
f"Dropping invalid columns in {type(self).__name__}.transform "
"is deprecated. In a future version, a TypeError will be raised. "
"Before calling .transform, select only columns which should be "
"valid for the transforming function.",
FutureWarning,
stacklevel=5,
)
else:
inds.append(i)

Expand Down
19 changes: 19 additions & 0 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class providing the base-class of operations.
Union,
cast,
)
import warnings

import numpy as np

Expand Down Expand Up @@ -1269,6 +1270,14 @@ def _python_agg_general(self, func, *args, **kwargs):
# if this function is invalid for this dtype, we will ignore it.
result = self.grouper.agg_series(obj, f)
except TypeError:
warnings.warn(
f"Dropping invalid columns in {type(self).__name__}.agg "
"is deprecated. In a future version, a TypeError will be raised. "
"Before calling .agg, select only columns which should be "
"valid for the aggregating function.",
FutureWarning,
stacklevel=3,
)
continue

key = base.OutputKey(label=name, position=idx)
Expand Down Expand Up @@ -2825,6 +2834,16 @@ def _get_cythonized_result(
vals, inferences = pre_processing(vals)
except TypeError as err:
error_msg = str(err)
howstr = how.replace("group_", "")
warnings.warn(
"Dropping invalid columns in "
f"{type(self).__name__}.{howstr} is deprecated. "
"In a future version, a TypeError will be raised. "
f"Before calling .{howstr}, select only columns which "
"should be valid for the function.",
FutureWarning,
stacklevel=3,
)
continue
vals = vals.astype(cython_dtype, copy=False)
if needs_2d:
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,8 @@ def func(ser):
else:
return ser.sum()

result = grouped.aggregate(func)
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"):
result = grouped.aggregate(func)
exp_grouped = three_group.loc[:, three_group.columns != "C"]
expected = exp_grouped.groupby(["A", "B"]).aggregate(func)
tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -1018,6 +1019,7 @@ def test_mangle_series_groupby(self):
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(reason="GH-26611. kwargs for multi-agg.")
@pytest.mark.filterwarnings("ignore:Dropping invalid columns:FutureWarning")
def test_with_kwargs(self):
f1 = lambda x, y, b=1: x.sum() + y + b
f2 = lambda x, y, b=2: x.sum() + y * b
Expand Down
14 changes: 11 additions & 3 deletions pandas/tests/groupby/aggregate/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,16 @@ def test_agg_api():
def peak_to_peak(arr):
return arr.max() - arr.min()

expected = grouped.agg([peak_to_peak])
with tm.assert_produces_warning(
FutureWarning, match="Dropping invalid", check_stacklevel=False
):
expected = grouped.agg([peak_to_peak])
expected.columns = ["data1", "data2"]
result = grouped.agg(peak_to_peak)

with tm.assert_produces_warning(
FutureWarning, match="Dropping invalid", check_stacklevel=False
):
result = grouped.agg(peak_to_peak)
tm.assert_frame_equal(result, expected)


Expand Down Expand Up @@ -294,7 +301,8 @@ def raiseException(df):
raise TypeError("test")

with pytest.raises(TypeError, match="test"):
df.groupby(0).agg(raiseException)
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
df.groupby(0).agg(raiseException)


def test_series_agg_multikey():
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -923,7 +923,8 @@ def aggfun(ser):
else:
return ser.sum()

agged2 = df.groupby(keys).aggregate(aggfun)
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"):
agged2 = df.groupby(keys).aggregate(aggfun)
assert len(agged2.columns) + 1 == len(df.columns)


Expand Down Expand Up @@ -1757,6 +1758,7 @@ def test_pivot_table_values_key_error():
@pytest.mark.parametrize(
"op", ["idxmax", "idxmin", "mad", "min", "max", "sum", "prod", "skew"]
)
@pytest.mark.filterwarnings("ignore:Dropping invalid columns:FutureWarning")
def test_empty_groupby(columns, keys, values, method, op, request):
# GH8093 & GH26411
override_dtype = None
Expand Down
11 changes: 9 additions & 2 deletions pandas/tests/groupby/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,10 @@ def test_quantile_raises():
df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])

with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"):
df.groupby("key").quantile()
with tm.assert_produces_warning(
FutureWarning, match="Dropping invalid columns"
):
df.groupby("key").quantile()


def test_quantile_out_of_bounds_q_raises():
Expand Down Expand Up @@ -236,7 +239,11 @@ def test_groupby_quantile_nullable_array(values, q):
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
def test_groupby_quantile_skips_invalid_dtype(q):
df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]})
result = df.groupby("a").quantile(q)

warn = None if isinstance(q, list) else FutureWarning
with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
result = df.groupby("a").quantile(q)

expected = df.groupby("a")[["b"]].quantile(q)
tm.assert_frame_equal(result, expected)

Expand Down
43 changes: 33 additions & 10 deletions pandas/tests/groupby/transform/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,9 @@ def test_transform_exclude_nuisance(df, duplicates):
grouped = df.groupby("A")

gbc = grouped["C"]
expected["C"] = gbc.transform(np.mean)
warn = FutureWarning if duplicates else None
with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
expected["C"] = gbc.transform(np.mean)
if duplicates:
# squeeze 1-column DataFrame down to Series
expected["C"] = expected["C"]["C"]
Expand All @@ -422,14 +424,16 @@ def test_transform_exclude_nuisance(df, duplicates):

expected["D"] = grouped["D"].transform(np.mean)
expected = DataFrame(expected)
result = df.groupby("A").transform(np.mean)
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"):
result = df.groupby("A").transform(np.mean)

tm.assert_frame_equal(result, expected)


def test_transform_function_aliases(df):
result = df.groupby("A").transform("mean")
expected = df.groupby("A").transform(np.mean)
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"):
result = df.groupby("A").transform("mean")
expected = df.groupby("A").transform(np.mean)
tm.assert_frame_equal(result, expected)

result = df.groupby("A")["C"].transform("mean")
Expand Down Expand Up @@ -498,7 +502,10 @@ def test_groupby_transform_with_int():
}
)
with np.errstate(all="ignore"):
result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())
with tm.assert_produces_warning(
FutureWarning, match="Dropping invalid columns"
):
result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())
expected = DataFrame(
{"B": np.nan, "C": Series([-1, 0, 1, -1, 0, 1], dtype="float64")}
)
Expand All @@ -514,15 +521,21 @@ def test_groupby_transform_with_int():
}
)
with np.errstate(all="ignore"):
result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())
with tm.assert_produces_warning(
FutureWarning, match="Dropping invalid columns"
):
result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())
expected = DataFrame({"B": np.nan, "C": [-1.0, 0.0, 1.0, -1.0, 0.0, 1.0]})
tm.assert_frame_equal(result, expected)

# int that needs float conversion
s = Series([2, 3, 4, 10, 5, -1])
df = DataFrame({"A": [1, 1, 1, 2, 2, 2], "B": 1, "C": s, "D": "foo"})
with np.errstate(all="ignore"):
result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())
with tm.assert_produces_warning(
FutureWarning, match="Dropping invalid columns"
):
result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())

s1 = s.iloc[0:3]
s1 = (s1 - s1.mean()) / s1.std()
Expand All @@ -532,7 +545,8 @@ def test_groupby_transform_with_int():
tm.assert_frame_equal(result, expected)

# int doesn't get downcasted
result = df.groupby("A").transform(lambda x: x * 2 / 2)
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"):
result = df.groupby("A").transform(lambda x: x * 2 / 2)
expected = DataFrame({"B": 1.0, "C": [2.0, 3.0, 4.0, 10.0, 5.0, -1.0]})
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -791,7 +805,11 @@ def test_transform_numeric_ret(cols, exp, comp_func, agg_func, request):
{"a": date_range("2018-01-01", periods=3), "b": range(3), "c": range(7, 10)}
)

result = df.groupby("b")[cols].transform(agg_func)
warn = FutureWarning
if isinstance(exp, Series) or agg_func != "size":
warn = None
with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
result = df.groupby("b")[cols].transform(agg_func)

if agg_func == "rank":
exp = exp.astype("float")
Expand Down Expand Up @@ -1103,7 +1121,12 @@ def test_transform_agg_by_name(request, reduction_func, obj):

args = {"nth": [0], "quantile": [0.5], "corrwith": [obj]}.get(func, [])

result = g.transform(func, *args)
warn = None
if isinstance(obj, DataFrame) and func == "size":
warn = FutureWarning

with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
result = g.transform(func, *args)

# this is the *definition* of a transformation
tm.assert_index_equal(result.index, obj.index)
Expand Down