Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: groupby nuisance warnings #46010

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import numpy as np

from pandas._libs import reduction as libreduction
import pandas._libs.lib as lib
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This import can be combined into the previous `from pandas._libs import ...` line.

from pandas._typing import (
ArrayLike,
Manager,
Expand Down Expand Up @@ -1102,9 +1103,14 @@ def _wrap_applied_output_series(
return self._reindex_output(result)

def _cython_transform(
self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
self,
how: str,
numeric_only: bool | lib.NoDefault = lib.no_default,
axis: int = 0,
**kwargs,
) -> DataFrame:
assert axis == 0 # handled by caller
numeric_only_bool = self._resolve_numeric_only(numeric_only)
# TODO: no tests with self.ndim == 1 for DataFrameGroupBy

# With self.axis == 0, we have multi-block tests
Expand All @@ -1113,7 +1119,8 @@ def _cython_transform(
# With self.axis == 1, _get_data_to_aggregate does a transpose
# so we always have a single block.
mgr: Manager2D = self._get_data_to_aggregate()
if numeric_only:
orig_len = len(mgr)
if numeric_only_bool:
mgr = mgr.get_numeric_data(copy=False)

def arr_func(bvalues: ArrayLike) -> ArrayLike:
Expand All @@ -1126,7 +1133,9 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike:
res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True)
res_mgr.set_axis(1, mgr.axes[1])

if len(res_mgr) < len(mgr):
if len(res_mgr) < len(mgr) or (
numeric_only is lib.no_default and len(res_mgr) < orig_len
):
warn_dropping_nuisance_columns_deprecated(type(self), how)

res_df = self.obj._constructor(res_mgr)
Expand Down
40 changes: 25 additions & 15 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1494,7 +1494,7 @@ def _python_agg_general(self, func, *args, **kwargs):
@final
def _agg_general(
self,
numeric_only: bool = True,
numeric_only: bool | lib.NoDefault = True,
min_count: int = -1,
*,
alias: str,
Expand Down Expand Up @@ -1553,15 +1553,21 @@ def _agg_py_fallback(

@final
def _cython_agg_general(
self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
self,
how: str,
alt: Callable,
numeric_only: bool | lib.NoDefault,
min_count: int = -1,
):
# Note: we never get here with how="ohlc" for DataFrameGroupBy;
# that goes through SeriesGroupBy

data = self._get_data_to_aggregate()
orig_len = len(data)
is_ser = data.ndim == 1
numeric_only_bool = self._resolve_numeric_only(numeric_only)

if numeric_only:
if numeric_only_bool:
if is_ser and not is_numeric_dtype(self._selected_obj.dtype):
# GH#41291 match Series behavior
kwd_name = "numeric_only"
Expand Down Expand Up @@ -1591,7 +1597,10 @@ def array_func(values: ArrayLike) -> ArrayLike:
# continue and exclude the block
new_mgr = data.grouped_reduce(array_func, ignore_failures=True)

if not is_ser and len(new_mgr) < len(data):
if not is_ser and (
len(new_mgr) < len(data)
or (numeric_only is lib.no_default and len(new_mgr) < orig_len)
):
warn_dropping_nuisance_columns_deprecated(type(self), how)

res = self._wrap_agged_manager(new_mgr)
Expand Down Expand Up @@ -1947,7 +1956,6 @@ def mean(
Name: B, dtype: float64
"""
numeric_only_bool = self._resolve_numeric_only(numeric_only)

if maybe_use_numba(engine):
from pandas.core._numba.kernels import sliding_mean

Expand All @@ -1956,7 +1964,7 @@ def mean(
result = self._cython_agg_general(
"mean",
alt=lambda x: Series(x).mean(numeric_only=numeric_only_bool),
numeric_only=numeric_only_bool,
numeric_only=numeric_only,
)
return result.__finalize__(self.obj, method="groupby")

Expand All @@ -1981,11 +1989,10 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
Median of values within each group.
"""
numeric_only_bool = self._resolve_numeric_only(numeric_only)

result = self._cython_agg_general(
"median",
alt=lambda x: Series(x).median(numeric_only=numeric_only_bool),
numeric_only=numeric_only_bool,
numeric_only=numeric_only,
)
return result.__finalize__(self.obj, method="groupby")

Expand Down Expand Up @@ -2180,8 +2187,6 @@ def sum(
"groupby_sum",
)
else:
numeric_only = self._resolve_numeric_only(numeric_only)

# If we are grouping on categoricals we want unobserved categories to
# return zero, rather than the default of NaN which the reindexing in
# _agg_general() returns. GH #31422
Expand All @@ -2200,8 +2205,6 @@ def sum(
def prod(
self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0
):
numeric_only = self._resolve_numeric_only(numeric_only)

return self._agg_general(
numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
)
Expand Down Expand Up @@ -3343,7 +3346,7 @@ def _get_cythonized_result(
-------
`Series` or `DataFrame` with filled values
"""
numeric_only = self._resolve_numeric_only(numeric_only)
numeric_only_bool = self._resolve_numeric_only(numeric_only)

if post_processing and not callable(post_processing):
raise ValueError("'post_processing' must be a callable!")
Expand Down Expand Up @@ -3412,13 +3415,17 @@ def blk_func(values: ArrayLike) -> ArrayLike:
# Operate block-wise instead of column-by-column
is_ser = obj.ndim == 1
mgr = self._get_data_to_aggregate()
orig_len = len(mgr.items)

if numeric_only:
if numeric_only_bool:
mgr = mgr.get_numeric_data()

res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True)

if not is_ser and len(res_mgr.items) != len(mgr.items):
if not is_ser and (
len(res_mgr.items) < len(mgr.items)
or (numeric_only is lib.no_default and len(res_mgr.items) < orig_len)
):
howstr = how.replace("group_", "")
warn_dropping_nuisance_columns_deprecated(type(self), howstr)

Expand Down Expand Up @@ -3922,6 +3929,9 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde


def warn_dropping_nuisance_columns_deprecated(cls, how: str) -> None:
if how == "add":
# groupby internally uses "add" instead of "sum" in some places
how = "sum"
warnings.warn(
"Dropping invalid columns in "
f"{cls.__name__}.{how} is deprecated. "
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/extension/base/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,8 @@ def test_in_numeric_groupby(self, data_for_grouping):
"C": [1, 1, 1, 1, 1, 1, 1, 1],
}
)
result = df.groupby("A").sum().columns
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
result = df.groupby("A").sum().columns

if data_for_grouping.dtype._is_numeric:
expected = pd.Index(["B", "C"])
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -1777,7 +1777,8 @@ def test_stack_multiple_bug(self):
multi = df.set_index(["DATE", "ID"])
multi.columns.name = "Params"
unst = multi.unstack("ID")
down = unst.resample("W-THU").mean()
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
down = unst.resample("W-THU").mean()

rs = down.stack("ID")
xp = unst.loc[:, ["VAR1"]].resample("W-THU").mean().stack("ID")
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/generic/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ def test_metadata_propagation_indiv_groupby(self):
"D": np.random.randn(8),
}
)
result = df.groupby("A").sum()
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
result = df.groupby("A").sum()
tm.assert_metadata_equivalent(df, result)

def test_metadata_propagation_indiv_resample(self):
Expand Down
13 changes: 9 additions & 4 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,8 @@ def test_basic(): # TODO: split this test
gb = df.groupby("A", observed=False)
exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True)
expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)})
result = gb.sum()
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
result = gb.sum()
tm.assert_frame_equal(result, expected)

# GH 8623
Expand Down Expand Up @@ -344,7 +345,8 @@ def test_observed(observed):
gb = df.groupby(["A", "B"], observed=observed)
exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"])
expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index)
result = gb.sum()
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
result = gb.sum()
if not observed:
expected = cartesian_product_for_groupers(
expected, [cat1, cat2], list("AB"), fill_value=0
Expand Down Expand Up @@ -807,8 +809,11 @@ def test_preserve_categorical_dtype():
}
)
for col in ["C1", "C2"]:
result1 = df.groupby(by=col, as_index=False, observed=False).mean()
result2 = df.groupby(by=col, as_index=True, observed=False).mean().reset_index()
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
result1 = df.groupby(by=col, as_index=False, observed=False).mean()
result2 = (
df.groupby(by=col, as_index=True, observed=False).mean().reset_index()
)
expected = exp_full.reindex(columns=result1.columns)
tm.assert_frame_equal(result1, expected)
tm.assert_frame_equal(result2, expected)
Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,8 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
elif method in ["min", "max"]:
# these have numeric_only kwarg, but default to False
warn = FutureWarning
elif method in ["mean", "median", "prod", "cumprod", "sum", "cumsum"]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add an explanatory comment here, similar to the ones at L256 and L259 (noting why these methods now emit a FutureWarning)?

warn = FutureWarning

with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
result = getattr(gb, method)()
Expand Down
Loading