Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REGR: NumPy func warning when dropping nuisance in agg, apply, transform #50627

Merged
merged 2 commits into from
Jan 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Fixed regressions
- Enforced reversion of ``color`` as an alias for ``c`` and ``size`` as an alias for ``s`` in function :meth:`DataFrame.plot.scatter` (:issue:`49732`)
- Fixed regression in :meth:`SeriesGroupBy.apply` setting a ``name`` attribute on the result if the result was a :class:`DataFrame` (:issue:`49907`)
- Fixed performance regression in setting with the :meth:`~DataFrame.at` indexer (:issue:`49771`)
- Fixed regression in the methods ``apply``, ``agg``, and ``transform`` where, when used with NumPy functions, the warning incorrectly informed users to supply ``numeric_only=True`` if the operation failed on non-numeric dtypes; such columns must instead be dropped prior to using these methods (:issue:`50538`)
-

.. ---------------------------------------------------------------------------
Expand Down
24 changes: 21 additions & 3 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,10 @@
SpecificationError,
)
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level
from pandas.util._exceptions import (
find_stack_level,
rewrite_warning,
)

from pandas.core.dtypes.cast import is_nested_object
from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -174,7 +177,15 @@ def agg(self) -> DataFrame | Series | None:
if callable(arg):
f = com.get_cython_func(arg)
if f and not args and not kwargs:
return getattr(obj, f)()
# GH#50538
old_msg = "The default value of numeric_only"
new_msg = (
f"The operation {arg} failed on a column. If any error is "
f"raised, this will raise an exception in a future version "
f"of pandas. Drop these columns to avoid this warning."
)
with rewrite_warning(old_msg, FutureWarning, new_msg):
return getattr(obj, f)()

# caller can react
return None
Expand Down Expand Up @@ -309,7 +320,14 @@ def transform_str_or_callable(self, func) -> DataFrame | Series:
if not args and not kwargs:
f = com.get_cython_func(func)
if f:
return getattr(obj, f)()
old_msg = "The default value of numeric_only"
new_msg = (
f"The operation {func} failed on a column. If any error is "
f"raised, this will raise an exception in a future version "
f"of pandas. Drop these columns to avoid this warning."
)
with rewrite_warning(old_msg, FutureWarning, new_msg):
return getattr(obj, f)()

# Two possible ways to use a UDF - apply or call directly
try:
Expand Down
38 changes: 33 additions & 5 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ class providing the base-class of operations.
"""
from __future__ import annotations

from contextlib import contextmanager
from contextlib import (
contextmanager,
nullcontext,
)
import datetime
from functools import (
partial,
Expand Down Expand Up @@ -64,7 +67,10 @@ class providing the base-class of operations.
cache_readonly,
doc,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._exceptions import (
find_stack_level,
rewrite_warning,
)

from pandas.core.dtypes.cast import ensure_dtype_can_hold_na
from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -1508,7 +1514,9 @@ def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs)
)
)
def apply(self, func, *args, **kwargs) -> NDFrameT:

# GH#50538
is_np_func = func in com._cython_table and func not in com._builtin_table
orig_func = func
func = com.is_builtin_func(func)

if isinstance(func, str):
Expand Down Expand Up @@ -1546,7 +1554,17 @@ def f(g):
# ignore SettingWithCopy here in case the user mutates
with option_context("mode.chained_assignment", None):
try:
result = self._python_apply_general(f, self._selected_obj)
# GH#50538
old_msg = "The default value of numeric_only"
new_msg = (
f"The operation {orig_func} failed on a column. If any error is "
f"raised, this will raise an exception in a future version "
f"of pandas. Drop these columns to avoid this warning."
)
with rewrite_warning(
old_msg, FutureWarning, new_msg
) if is_np_func else nullcontext():
result = self._python_apply_general(f, self._selected_obj)
except TypeError:
# gh-20949
# try again, with .apply acting as a filtering
Expand All @@ -1557,7 +1575,17 @@ def f(g):
# on a string grouper column

with self._group_selection_context():
return self._python_apply_general(f, self._selected_obj)
# GH#50538
old_msg = "The default value of numeric_only"
new_msg = (
f"The operation {orig_func} failed on a column. If any error "
f"is raised, this will raise an exception in a future version "
f"of pandas. Drop these columns to avoid this warning."
)
with rewrite_warning(
old_msg, FutureWarning, new_msg
) if is_np_func else nullcontext():
return self._python_apply_general(f, self._selected_obj)

return result

Expand Down
21 changes: 21 additions & 0 deletions pandas/tests/apply/test_frame_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -1287,6 +1287,27 @@ def test_nuiscance_columns():
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("method", ["agg", "apply", "transform"])
def test_numeric_only_warning_numpy(method):
    """Exercise ``np.mean`` through a DataFrame method on a frame with a string column.

    * ``agg`` must emit the "The operation ... failed" FutureWarning and must
      reject a ``numeric_only`` keyword with a TypeError.
    * ``apply`` must raise a TypeError.
    * any other method (``transform`` in the parametrization) must emit the
      same FutureWarning while raising a ValueError.
    """
    # GH#50538
    frame = DataFrame({"a": [1, 1, 2], "b": list("xyz")})
    op = getattr(frame, method)
    warn_pattern = "The operation <function mean.*failed"

    if method == "agg":
        with tm.assert_produces_warning(FutureWarning, match=warn_pattern):
            op(np.mean)
        # Ensure users can't pass numeric_only
        with pytest.raises(TypeError, match="got an unexpected keyword argument"):
            op(np.mean, numeric_only=True)
        return

    if method == "apply":
        with pytest.raises(TypeError, match="Could not convert"):
            op(np.mean)
        return

    # Remaining case: the warning check sits inside the raises-context, so the
    # warning is verified even though the call itself ends in a ValueError.
    with pytest.raises(ValueError, match="Function did not transform"):
        with tm.assert_produces_warning(FutureWarning, match=warn_pattern):
            op(np.mean)


@pytest.mark.parametrize("how", ["agg", "apply"])
def test_non_callable_aggregates(how):

Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1454,3 +1454,15 @@ def test_agg_of_mode_list(test, constant):
expected = expected.set_index(0)

tm.assert_frame_equal(result, expected)


def test_numeric_only_warning_numpy():
    """Check ``groupby(...).agg(np.mean)`` on a frame containing a string column.

    The call must emit the "The operation ... failed" FutureWarning, and
    passing ``numeric_only=True`` must raise a TypeError.
    """
    # GH#50538
    grouped = DataFrame(
        {"a": [1, 1, 2], "b": list("xyz"), "c": [3, 4, 5]}
    ).groupby("a")

    with tm.assert_produces_warning(
        FutureWarning, match="The operation <function mean.*failed"
    ):
        grouped.agg(np.mean)

    # Ensure users can't pass numeric_only
    with pytest.raises(TypeError, match="got an unexpected keyword argument"):
        grouped.agg(np.mean, numeric_only=True)
13 changes: 13 additions & 0 deletions pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -1357,3 +1357,16 @@ def test_empty_df(method, op):
)

tm.assert_series_equal(result, expected)


def test_numeric_only_warning_numpy():
    """Check ``groupby(...).apply(np.mean)`` on a frame containing a string column.

    The call must emit the "The operation ... failed" FutureWarning, and
    passing ``numeric_only=True`` must raise a TypeError.
    """
    # GH#50538
    grouped = DataFrame(
        {"a": [1, 1, 2], "b": list("xyz"), "c": [3, 4, 5]}
    ).groupby("a")
    warn_pattern = "The operation <function mean.*failed"

    # Warning is raised from within NumPy
    warning_check = tm.assert_produces_warning(
        FutureWarning, match=warn_pattern, check_stacklevel=False
    )
    with warning_check:
        grouped.apply(np.mean)

    # Ensure users can't pass numeric_only
    with pytest.raises(TypeError, match="got an unexpected keyword argument"):
        grouped.apply(np.mean, numeric_only=True)
18 changes: 16 additions & 2 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,9 +486,14 @@ def test_frame_set_name_single(df):
result = df.groupby("A", as_index=False).mean()
assert result.index.name != "A"

# GH#50538
msg = "The operation <function mean.*failed"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = grouped.agg(np.mean)
assert result.index.name == "A"
# Ensure users can't pass numeric_only
with pytest.raises(TypeError, match="got an unexpected keyword argument"):
grouped.agg(np.mean, numeric_only=True)

result = grouped.agg({"C": np.mean, "D": np.std})
assert result.index.name == "A"
Expand Down Expand Up @@ -766,19 +771,24 @@ def test_as_index_series_return_frame(df):
grouped = df.groupby("A", as_index=False)
grouped2 = df.groupby(["A", "B"], as_index=False)

msg = "The default value of numeric_only"
# GH#50538
msg = "The operation <function sum.*failed"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = grouped["C"].agg(np.sum)
expected = grouped.agg(np.sum).loc[:, ["A", "C"]]
assert isinstance(result, DataFrame)
tm.assert_frame_equal(result, expected)
# Ensure users can't pass numeric_only
with pytest.raises(TypeError, match="got an unexpected keyword argument"):
grouped.agg(np.mean, numeric_only=True)

result2 = grouped2["C"].agg(np.sum)
expected2 = grouped2.agg(np.sum).loc[:, ["A", "B", "C"]]
assert isinstance(result2, DataFrame)
tm.assert_frame_equal(result2, expected2)

result = grouped["C"].sum()
msg = "The default value of numeric_only"
with tm.assert_produces_warning(FutureWarning, match=msg):
expected = grouped.sum().loc[:, ["A", "C"]]
assert isinstance(result, DataFrame)
Expand Down Expand Up @@ -1021,10 +1031,14 @@ def test_wrap_aggregated_output_multindex(mframe):
df["baz", "two"] = "peekaboo"

keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
msg = "The default value of numeric_only"
# GH#50538
msg = "The operation <function mean.*failed"
with tm.assert_produces_warning(FutureWarning, match=msg):
agged = df.groupby(keys).agg(np.mean)
assert isinstance(agged.columns, MultiIndex)
# Ensure users can't pass numeric_only
with pytest.raises(TypeError, match="got an unexpected keyword argument"):
df.groupby(keys).agg(np.mean, numeric_only=True)

def aggfun(ser):
if ser.name == ("foo", "one"):
Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/groupby/transform/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -1563,3 +1563,18 @@ def test_as_index_no_change(keys, df, groupby_func):
result = gb_as_index_true.transform(groupby_func, *args)
expected = gb_as_index_false.transform(groupby_func, *args)
tm.assert_equal(result, expected)


@pytest.mark.parametrize("func", [np.mean, np.cumprod])
def test_numeric_only_warning_numpy(func):
    """Check ``groupby(...).transform(func)`` on a frame containing a string column.

    Without ``numeric_only`` the call must emit the "The default value of
    numeric_only" FutureWarning; with ``numeric_only=True`` it must succeed and
    return only the transformed numeric column ``c``.
    """
    # GH#50538
    grouped = DataFrame(
        {"a": [1, 1, 2], "b": list("xyz"), "c": [3, 4, 5]}
    ).groupby("a")

    with tm.assert_produces_warning(
        FutureWarning, match="The default value of numeric_only"
    ):
        grouped.transform(func)

    # Ensure users can pass numeric_only
    observed = grouped.transform(func, numeric_only=True)

    if func == np.mean:
        c_values = [3.5, 3.5, 5.0]
    else:
        c_values = [3, 12, 5]
    tm.assert_frame_equal(observed, DataFrame({"c": c_values}))
21 changes: 21 additions & 0 deletions pandas/tests/resample/test_resample_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -938,3 +938,24 @@ def test_series_downsample_method(method, numeric_only, expected_data):
result = func(numeric_only=numeric_only)
expected = Series(expected_data, index=expected_index)
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("method", ["agg", "apply", "transform"])
def test_numeric_only_warning_numpy(method):
    """Exercise ``np.mean`` through a resampler method after adding a string column.

    For ``transform`` the "The default value of numeric_only" FutureWarning is
    expected and ``numeric_only=True`` must be accepted, matching
    ``transform("mean", numeric_only=True)``.  For every other method the
    "The operation ... failed" FutureWarning is expected and ``numeric_only``
    must be rejected with a TypeError.
    """
    # GH#50538
    resampled = _test_frame.assign(D="x").resample("H")
    op = getattr(resampled, method)

    if method != "transform":
        with tm.assert_produces_warning(
            FutureWarning, match="The operation <function mean.*failed"
        ):
            op(np.mean)
        # Ensure users can't pass numeric_only
        with pytest.raises(TypeError, match="got an unexpected keyword argument"):
            op(np.mean, numeric_only=True)
        return

    with tm.assert_produces_warning(
        FutureWarning, match="The default value of numeric_only"
    ):
        op(np.mean)
    # Ensure users can pass numeric_only
    observed = op(np.mean, numeric_only=True)
    reference = resampled.transform("mean", numeric_only=True)
    tm.assert_frame_equal(observed, reference)
13 changes: 10 additions & 3 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,8 @@ def test_pivot_table_nocols(self):
df = DataFrame(
{"rows": ["a", "b", "c"], "cols": ["x", "y", "z"], "values": [1, 2, 3]}
)
msg = "pivot_table dropped a column because it failed to aggregate"
# GH#50538
msg = "The operation <function sum.*failed"
with tm.assert_produces_warning(FutureWarning, match=msg):
rs = df.pivot_table(columns="cols", aggfunc=np.sum)
xp = df.pivot_table(index="cols", aggfunc=np.sum).T
Expand Down Expand Up @@ -907,7 +908,8 @@ def test_no_col(self):

# to help with a buglet
self.data.columns = [k * 2 for k in self.data.columns]
msg = "pivot_table dropped a column because it failed to aggregate"
# GH#50538
msg = "The operation <function mean.*failed"
with tm.assert_produces_warning(FutureWarning, match=msg):
table = self.data.pivot_table(
index=["AA", "BB"], margins=True, aggfunc=np.mean
Expand All @@ -916,6 +918,7 @@ def test_no_col(self):
totals = table.loc[("All", ""), value_col]
assert totals == self.data[value_col].mean()

msg = "pivot_table dropped a column because it failed to aggregate"
with tm.assert_produces_warning(FutureWarning, match=msg):
table = self.data.pivot_table(
index=["AA", "BB"], margins=True, aggfunc="mean"
Expand Down Expand Up @@ -975,7 +978,11 @@ def test_margin_with_only_columns_defined(
}
)

msg = "pivot_table dropped a column because it failed to aggregate"
if aggfunc == "sum":
msg = "pivot_table dropped a column because it failed to aggregate"
else:
# GH#50538
msg = "The operation <function mean.*failed"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc)
expected = DataFrame(values, index=Index(["D", "E"]), columns=expected_columns)
Expand Down