Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: dropping nuisance columns in DataFrame reductions #41480

Merged
merged 13 commits into from
May 21, 2021
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,7 @@ this pathological behavior (:issue:`37827`):
*New behavior*:

.. ipython:: python
:okwarning:

df.mean()

Expand All @@ -394,6 +395,7 @@ instead of casting to a NumPy array which may have different semantics (:issue:`
:issue:`28949`, :issue:`21020`).

.. ipython:: python
:okwarning:

ser = pd.Series([0, 1], dtype="category", name="A")
df = ser.to_frame()
Expand All @@ -411,6 +413,7 @@ instead of casting to a NumPy array which may have different semantics (:issue:`
*New behavior*:

.. ipython:: python
:okwarning:

df.any()

Expand Down
41 changes: 41 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,47 @@ Deprecations
- Deprecated passing arguments as positional (except for ``"method"``) in :meth:`DataFrame.interpolate` and :meth:`Series.interpolate` (:issue:`41485`)
- Deprecated passing arguments (apart from ``value``) as positional in :meth:`DataFrame.fillna` and :meth:`Series.fillna` (:issue:`41485`)

.. _whatsnew_130.deprecations.nuisance_columns:

Deprecated Dropping Nuisance Columns in DataFrame Reductions and DataFrameGroupBy Operations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
When calling a reduction (.min, .max, .sum, ...) on a :class:`DataFrame` with
``numeric_only=None`` (the default), columns on which the reduction raises ``TypeError``
are silently ignored and dropped from the result.

This behavior is deprecated. In a future version, the ``TypeError`` will be raised,
and users will need to select only valid columns before calling the function.

For example:

.. ipython:: python

df = pd.DataFrame({"A": [1, 2, 3, 4], "B": pd.date_range("2016-01-01", periods=4)})
df

jreback marked this conversation as resolved.
Show resolved Hide resolved
*Old behavior*:

.. code-block:: ipython

In [3]: df.prod()
Out[3]:
A 24
dtype: int64

*Future behavior*:

.. code-block:: ipython

In [4]: df.prod()
...
TypeError: 'DatetimeArray' does not implement reduction 'prod'

In [5]: df[["A"]].prod()
Out[5]:
A 24
dtype: int64

.. ---------------------------------------------------------------------------


Expand Down
28 changes: 28 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9854,6 +9854,21 @@ def _get_data() -> DataFrame:
# Even if we are object dtype, follow numpy and return
# float64, see test_apply_funcs_over_empty
out = out.astype(np.float64)

if numeric_only is None and out.shape[0] != df.shape[1]:
# columns have been dropped GH#41480
arg_name = "numeric_only"
if name in ["all", "any"]:
arg_name = "bool_only"
warnings.warn(
"Dropping of nuisance columns in DataFrame reductions "
f"(with '{arg_name}=None') is deprecated; in a future "
"version this will raise TypeError. Select only valid "
"columns before calling the reduction.",
FutureWarning,
stacklevel=5,
)

return out

assert numeric_only is None
Expand All @@ -9874,6 +9889,19 @@ def _get_data() -> DataFrame:
with np.errstate(all="ignore"):
result = func(values)

# columns have been dropped GH#41480
arg_name = "numeric_only"
if name in ["all", "any"]:
arg_name = "bool_only"
warnings.warn(
"Dropping of nuisance columns in DataFrame reductions "
f"(with '{arg_name}=None') is deprecated; in a future "
"version this will raise TypeError. Select only valid "
"columns before calling the reduction.",
FutureWarning,
stacklevel=5,
)

if hasattr(result, "dtype"):
if filter_type == "bool" and notna(result).all():
result = result.astype(np.bool_)
Expand Down
10 changes: 7 additions & 3 deletions pandas/tests/apply/test_frame_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -1209,7 +1209,10 @@ def test_nuiscance_columns():
)
tm.assert_frame_equal(result, expected)

result = df.agg("sum")
with tm.assert_produces_warning(
FutureWarning, match="Select only valid", check_stacklevel=False
):
result = df.agg("sum")
expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"])
tm.assert_series_equal(result, expected)

Expand Down Expand Up @@ -1426,8 +1429,9 @@ def test_apply_datetime_tz_issue():
@pytest.mark.parametrize("method", ["min", "max", "sum"])
def test_consistency_of_aggregates_of_columns_with_missing_values(df, method):
# GH 16832
none_in_first_column_result = getattr(df[["A", "B"]], method)()
none_in_second_column_result = getattr(df[["B", "A"]], method)()
with tm.assert_produces_warning(FutureWarning, match="Select only valid"):
none_in_first_column_result = getattr(df[["A", "B"]], method)()
none_in_second_column_result = getattr(df[["B", "A"]], method)()

tm.assert_series_equal(none_in_first_column_result, none_in_second_column_result)

Expand Down
1 change: 1 addition & 0 deletions pandas/tests/apply/test_invalid_arg.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,7 @@ def test_transform_wont_agg_series(string_series, func):
@pytest.mark.parametrize(
"op_wrapper", [lambda x: x, lambda x: [x], lambda x: {"A": x}, lambda x: {"A": [x]}]
)
@pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
def test_transform_reducer_raises(all_reductions, frame_or_series, op_wrapper):
# GH 35964
op = op_wrapper(all_reductions)
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/frame/methods/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ def test_quantile(self, datetime_frame):
# non-numeric exclusion
df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]})
rs = df.quantile(0.5)
xp = df.median().rename(0.5)
with tm.assert_produces_warning(FutureWarning, match="Select only valid"):
xp = df.median().rename(0.5)
tm.assert_series_equal(rs, xp)

# axis
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/frame/methods/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ def test_rank_methods_frame(self):

@td.skip_array_manager_not_yet_implemented
@pytest.mark.parametrize("dtype", ["O", "f8", "i8"])
@pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
def test_rank_descending(self, method, dtype):

if "i" in dtype:
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/frame/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1021,6 +1021,7 @@ def test_zero_len_frame_with_series_corner_cases():
tm.assert_frame_equal(result, expected)


@pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
def test_frame_single_columns_object_sum_axis_1():
# GH 13758
data = {
Expand Down
Loading