-
-
Notifications
You must be signed in to change notification settings - Fork 18k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BUG: inconsistent DataFrame.agg behavoir when passing as kwargs numeric_only=True #49534
Changes from all commits
200c9fb
8505c49
887c3c5
8fab5a8
297c53d
ca3a7c1
a4e7c1a
435347f
bf86a8d
3d55579
2a2f0c0
de584bf
61a4b50
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,7 +64,6 @@ | |
from pandas.core.resample import Resampler | ||
from pandas.core.window.rolling import BaseWindow | ||
|
||
|
||
ResType = Dict[int, Any] | ||
|
||
|
||
|
@@ -284,6 +283,12 @@ def transform_str_or_callable(self, func) -> DataFrame | Series: | |
except Exception: | ||
return func(obj, *args, **kwargs) | ||
|
||
def _maybe_filter_numeric_only(self) -> Any: | ||
if "numeric_only" in self.kwargs and self.kwargs["numeric_only"] is True: | ||
obj = self.obj._get_numeric_data() | ||
return obj | ||
return self.obj | ||
|
||
def agg_list_like(self) -> DataFrame | Series: | ||
""" | ||
Compute aggregation in the case of a list-like argument. | ||
|
@@ -294,7 +299,7 @@ def agg_list_like(self) -> DataFrame | Series: | |
""" | ||
from pandas.core.reshape.concat import concat | ||
|
||
obj = self.obj | ||
obj = self._maybe_filter_numeric_only() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This will cause different behaviors for list and non-list arguments that contain a callable. E.g.
Currently in this PR, I believe the first line will pass I suggest only using the DataFrame filtered to numeric_only when a given argument in the list is a string; otherwise to use the full DataFrame and pass the numeric_only kwarg to the supplied UDF. |
||
arg = cast(List[AggFuncTypeBase], self.f) | ||
|
||
if getattr(obj, "axis", 0) == 1: | ||
|
@@ -1169,7 +1174,6 @@ def reconstruct_func( | |
|
||
if not relabeling: | ||
if isinstance(func, list) and len(func) > len(set(func)): | ||
|
||
# GH 28426 will raise error if duplicated function names are used and | ||
# there is no reassigned name | ||
raise SpecificationError( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9213,6 +9213,21 @@ def _gotitem( | |
2 8.0 | ||
3 NaN | ||
dtype: float64 | ||
|
||
Aggregate functions on DataFrames with mixed non-numeric and numeric columns. | ||
|
||
>>> df = pd.DataFrame([['a', 1, 4], | ||
... ['b', 2, 5], | ||
... ['c', 3, 6]], | ||
... columns=['A', 'B', 'C']) | ||
|
||
Works equivalently as above. Add argument `numeric_only=True` to | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What does "Works equivalently as above." mean here? |
||
aggregate only numeric columns. | ||
|
||
>>> df.agg(['mean', 'std'], numeric_only=True) | ||
B C | ||
mean 2.0 5.0 | ||
std 1.0 1.0 | ||
""" | ||
) | ||
|
||
|
@@ -9377,6 +9392,9 @@ def apply( | |
Functions that mutate the passed object can produce unexpected | ||
behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` | ||
for more details. | ||
Use the keyword argument `numeric_only=True` to apply functions | ||
only to numeric columns and to skip the non-numeric columns, | ||
e.g. the column contains a string. | ||
thlautenschlaeger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
Examples | ||
-------- | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -858,6 +858,22 @@ def test_with_dictlike_columns_with_infer(): | |
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
def test_with_dictlike_functions(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We're no longer changing dictlike behavior, I think this can be removed. It should fail, but that is another (unrelated) issue. |
||
# GH 49352 | ||
|
||
df = DataFrame({"a": [1, 2, 3], "b": list("abc"), "c": [3, 4, 5]}) | ||
functions = {"a": "std", "c": ["mean", "sum"]} | ||
result = df.agg(functions, numeric_only=True) | ||
expected = DataFrame( | ||
index=["std", "mean", "sum"], | ||
data={ | ||
"a": [1.0, np.nan, np.nan], | ||
"c": [np.nan, 4.0, 12.0], | ||
}, | ||
) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
def test_with_listlike_columns(): | ||
# GH 17348 | ||
df = DataFrame( | ||
|
@@ -887,6 +903,24 @@ def test_with_listlike_columns_returning_list(): | |
tm.assert_series_equal(result, expected) | ||
|
||
|
||
def test_with_listlike_functions(): | ||
# GH 49352 | ||
|
||
df = DataFrame({"a": [1, 2, 3], "b": list("abc"), "c": [3, 4, 5]}) | ||
functions = ["mean", "std"] | ||
result = df.agg(functions, numeric_only=True) | ||
expected = DataFrame(index=["mean", "std"], data={"a": [2.0, 1.0], "c": [4.0, 1.0]}) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
expected = df.agg("mean", numeric_only=True) | ||
expected.name = "mean" | ||
tm.assert_series_equal(result.loc["mean"], expected) | ||
|
||
expected = df.agg("std", numeric_only=True) | ||
expected.name = "std" | ||
tm.assert_series_equal(result.loc["std"], expected) | ||
|
||
|
||
def test_infer_output_shape_columns(): | ||
# GH 18573 | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why
is True
here? I haven't checked all, but I believe DataFrame methods use "truthy" (i.e.and self.kwargs["numeric_only"]
):There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't believe there is any need for the
is True
here. This would give surprising results if the user passes e.g.np.bool_(True)
.Also, can you use
kwargs.get("numeric_only", False)