diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index ef6d45fa0140b..7a55acbd3031d 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1000,6 +1000,7 @@ instance method on each data group. This is pretty easy to do by passing lambda functions: .. ipython:: python + :okwarning: grouped = df.groupby("A") grouped.agg(lambda x: x.std()) @@ -1009,6 +1010,7 @@ arguments. Using a bit of metaprogramming cleverness, GroupBy now has the ability to "dispatch" method calls to the groups: .. ipython:: python + :okwarning: grouped.std() diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index d357e4a633347..e2e05d98845f6 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -720,6 +720,46 @@ For example: A 24 dtype: int64 + +Similarly, when applying a function to :class:`DataFrameGroupBy`, columns on which +the function raises ``TypeError`` are currently silently ignored and dropped +from the result. + +This behavior is deprecated. In a future version, the ``TypeError`` +will be raised, and users will need to select only valid columns before calling +the function. + +For example: + +.. ipython:: python + + df = pd.DataFrame({"A": [1, 2, 3, 4], "B": pd.date_range("2016-01-01", periods=4)}) + gb = df.groupby([1, 1, 2, 2]) + +*Old behavior*: + +.. code-block:: ipython + + In [4]: gb.prod(numeric_only=False) + Out[4]: + A + 1 2 + 2 12 + +*Future behavior*: + +.. code-block:: ipython + + In [5]: gb.prod(numeric_only=False) + ... + TypeError: datetime64 type does not support prod operations + + In [6]: gb[["A"]].prod(numeric_only=False) + Out[6]: + A + 1 2 + 2 12 + .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c38c51d46f83e..dec68ab8f392d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1087,6 +1087,15 @@ def array_func(values: ArrayLike) -> ArrayLike: if not len(new_mgr) and len(orig): # If the original Manager was already empty, no need to raise raise DataError("No numeric types to aggregate") + if len(new_mgr) < len(data): + warnings.warn( + f"Dropping invalid columns in {type(self).__name__}.{how} " + "is deprecated. In a future version, a TypeError will be raised. " + f"Before calling .{how}, select only columns which should be " + "valid for the function.", + FutureWarning, + stacklevel=4, + ) return self._wrap_agged_manager(new_mgr) @@ -1283,6 +1292,16 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike: res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True) res_mgr.set_axis(1, mgr.axes[1]) + if len(res_mgr) < len(mgr): + warnings.warn( + f"Dropping invalid columns in {type(self).__name__}.{how} " + "is deprecated. In a future version, a TypeError will be raised. " + f"Before calling .{how}, select only columns which should be " + "valid for the transforming function.", + FutureWarning, + stacklevel=4, + ) + res_df = self.obj._constructor(res_mgr) if self.axis == 1: res_df = res_df.T @@ -1420,7 +1439,14 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: output[i] = sgb.transform(wrapper) except TypeError: # e.g. trying to call nanmean with string values - pass + warnings.warn( + f"Dropping invalid columns in {type(self).__name__}.transform " + "is deprecated. In a future version, a TypeError will be raised. 
" + "Before calling .transform, select only columns which should be " + "valid for the transforming function.", + FutureWarning, + stacklevel=5, + ) else: inds.append(i) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b27eb4bb8f325..3ce9b9bececf0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -30,6 +30,7 @@ class providing the base-class of operations. Union, cast, ) +import warnings import numpy as np @@ -1270,6 +1271,14 @@ def _python_agg_general(self, func, *args, **kwargs): # if this function is invalid for this dtype, we will ignore it. result = self.grouper.agg_series(obj, f) except TypeError: + warnings.warn( + f"Dropping invalid columns in {type(self).__name__}.agg " + "is deprecated. In a future version, a TypeError will be raised. " + "Before calling .agg, select only columns which should be " + "valid for the aggregating function.", + FutureWarning, + stacklevel=3, + ) continue key = base.OutputKey(label=name, position=idx) @@ -2829,6 +2838,16 @@ def _get_cythonized_result( vals, inferences = pre_processing(vals) except TypeError as err: error_msg = str(err) + howstr = how.replace("group_", "") + warnings.warn( + "Dropping invalid columns in " + f"{type(self).__name__}.{howstr} is deprecated. " + "In a future version, a TypeError will be raised. 
" + f"Before calling .{howstr}, select only columns which " + "should be valid for the function.", + FutureWarning, + stacklevel=3, + ) continue vals = vals.astype(cython_dtype, copy=False) if needs_2d: diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 18739320d90d3..eb82e03aea82f 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -257,7 +257,8 @@ def func(ser): else: return ser.sum() - result = grouped.aggregate(func) + with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"): + result = grouped.aggregate(func) exp_grouped = three_group.loc[:, three_group.columns != "C"] expected = exp_grouped.groupby(["A", "B"]).aggregate(func) tm.assert_frame_equal(result, expected) @@ -1020,6 +1021,7 @@ def test_mangle_series_groupby(self): tm.assert_frame_equal(result, expected) @pytest.mark.xfail(reason="GH-26611. kwargs for multi-agg.") + @pytest.mark.filterwarnings("ignore:Dropping invalid columns:FutureWarning") def test_with_kwargs(self): f1 = lambda x, y, b=1: x.sum() + y + b f2 = lambda x, y, b=2: x.sum() + y * b diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 681192881c301..4d30543355d47 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -44,9 +44,16 @@ def test_agg_api(): def peak_to_peak(arr): return arr.max() - arr.min() - expected = grouped.agg([peak_to_peak]) + with tm.assert_produces_warning( + FutureWarning, match="Dropping invalid", check_stacklevel=False + ): + expected = grouped.agg([peak_to_peak]) expected.columns = ["data1", "data2"] - result = grouped.agg(peak_to_peak) + + with tm.assert_produces_warning( + FutureWarning, match="Dropping invalid", check_stacklevel=False + ): + result = grouped.agg(peak_to_peak) tm.assert_frame_equal(result, expected) @@ -294,7 +301,8 @@ def 
raiseException(df): raise TypeError("test") with pytest.raises(TypeError, match="test"): - df.groupby(0).agg(raiseException) + with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"): + df.groupby(0).agg(raiseException) def test_series_agg_multikey(): diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 4fa21a259e7cb..95bb010015f62 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -87,13 +87,15 @@ def test_max_min_object_multiple_columns(using_array_manager): gb = df.groupby("A") - result = gb.max(numeric_only=False) + with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"): + result = gb.max(numeric_only=False) # "max" is valid for column "C" but not for "B" ei = Index([1, 2, 3], name="A") expected = DataFrame({"C": ["b", "d", "e"]}, index=ei) tm.assert_frame_equal(result, expected) - result = gb.min(numeric_only=False) + with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"): + result = gb.min(numeric_only=False) # "min" is valid for column "C" but not for "B" ei = Index([1, 2, 3], name="A") expected = DataFrame({"C": ["a", "c", "e"]}, index=ei) @@ -221,7 +223,10 @@ def test_averages(self, df, method): ], ) - result = getattr(gb, method)(numeric_only=False) + with tm.assert_produces_warning( + FutureWarning, match="Dropping invalid", check_stacklevel=False + ): + result = getattr(gb, method)(numeric_only=False) tm.assert_frame_equal(result.reindex_like(expected), expected) expected_columns = expected.columns @@ -303,10 +308,27 @@ def test_cummin_cummax(self, df, method): def _check(self, df, method, expected_columns, expected_columns_numeric): gb = df.groupby("group") - result = getattr(gb, method)() + # cummin, cummax dont have numeric_only kwarg, always use False + warn = None + if method in ["cummin", "cummax"]: + # these dont have numeric_only kwarg, always use False + warn = FutureWarning + elif method in ["min", 
"max"]: + # these have numeric_only kwarg, but default to False + warn = FutureWarning + + with tm.assert_produces_warning(warn, match="Dropping invalid columns"): + result = getattr(gb, method)() + tm.assert_index_equal(result.columns, expected_columns_numeric) - result = getattr(gb, method)(numeric_only=False) + # GH#41475 deprecated silently ignoring nuisance columns + warn = None + if len(expected_columns) < len(gb._obj_with_exclusions.columns): + warn = FutureWarning + with tm.assert_produces_warning(warn, match="Dropping invalid columns"): + result = getattr(gb, method)(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c37dc17b85dd2..29402d6b8d538 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -923,7 +923,8 @@ def aggfun(ser): else: return ser.sum() - agged2 = df.groupby(keys).aggregate(aggfun) + with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"): + agged2 = df.groupby(keys).aggregate(aggfun) assert len(agged2.columns) + 1 == len(df.columns) @@ -1757,6 +1758,7 @@ def test_pivot_table_values_key_error(): @pytest.mark.parametrize( "op", ["idxmax", "idxmin", "mad", "min", "max", "sum", "prod", "skew"] ) +@pytest.mark.filterwarnings("ignore:Dropping invalid columns:FutureWarning") @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning") def test_empty_groupby(columns, keys, values, method, op, request): # GH8093 & GH26411 diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 9c9d1aa881890..90437b9139594 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -155,7 +155,10 @@ def test_quantile_raises(): df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"): 
- df.groupby("key").quantile() + with tm.assert_produces_warning( + FutureWarning, match="Dropping invalid columns" + ): + df.groupby("key").quantile() def test_quantile_out_of_bounds_q_raises(): @@ -236,7 +239,11 @@ def test_groupby_quantile_nullable_array(values, q): @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) def test_groupby_quantile_skips_invalid_dtype(q): df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]}) - result = df.groupby("a").quantile(q) + + warn = None if isinstance(q, list) else FutureWarning + with tm.assert_produces_warning(warn, match="Dropping invalid columns"): + result = df.groupby("a").quantile(q) + expected = df.groupby("a")[["b"]].quantile(q) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 09317cbeec658..1949d03998512 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -409,7 +409,9 @@ def test_transform_exclude_nuisance(df, duplicates): grouped = df.groupby("A") gbc = grouped["C"] - expected["C"] = gbc.transform(np.mean) + warn = FutureWarning if duplicates else None + with tm.assert_produces_warning(warn, match="Dropping invalid columns"): + expected["C"] = gbc.transform(np.mean) if duplicates: # squeeze 1-column DataFrame down to Series expected["C"] = expected["C"]["C"] @@ -422,14 +424,16 @@ def test_transform_exclude_nuisance(df, duplicates): expected["D"] = grouped["D"].transform(np.mean) expected = DataFrame(expected) - result = df.groupby("A").transform(np.mean) + with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"): + result = df.groupby("A").transform(np.mean) tm.assert_frame_equal(result, expected) def test_transform_function_aliases(df): - result = df.groupby("A").transform("mean") - expected = df.groupby("A").transform(np.mean) + with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"): + 
result = df.groupby("A").transform("mean") + expected = df.groupby("A").transform(np.mean) tm.assert_frame_equal(result, expected) result = df.groupby("A")["C"].transform("mean") @@ -498,7 +502,10 @@ def test_groupby_transform_with_int(): } ) with np.errstate(all="ignore"): - result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) + with tm.assert_produces_warning( + FutureWarning, match="Dropping invalid columns" + ): + result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) expected = DataFrame( {"B": np.nan, "C": Series([-1, 0, 1, -1, 0, 1], dtype="float64")} ) @@ -514,7 +521,10 @@ def test_groupby_transform_with_int(): } ) with np.errstate(all="ignore"): - result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) + with tm.assert_produces_warning( + FutureWarning, match="Dropping invalid columns" + ): + result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) expected = DataFrame({"B": np.nan, "C": [-1.0, 0.0, 1.0, -1.0, 0.0, 1.0]}) tm.assert_frame_equal(result, expected) @@ -522,7 +532,10 @@ def test_groupby_transform_with_int(): s = Series([2, 3, 4, 10, 5, -1]) df = DataFrame({"A": [1, 1, 1, 2, 2, 2], "B": 1, "C": s, "D": "foo"}) with np.errstate(all="ignore"): - result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) + with tm.assert_produces_warning( + FutureWarning, match="Dropping invalid columns" + ): + result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) s1 = s.iloc[0:3] s1 = (s1 - s1.mean()) / s1.std() @@ -532,7 +545,8 @@ def test_groupby_transform_with_int(): tm.assert_frame_equal(result, expected) # int doesn't get downcasted - result = df.groupby("A").transform(lambda x: x * 2 / 2) + with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"): + result = df.groupby("A").transform(lambda x: x * 2 / 2) expected = DataFrame({"B": 1.0, "C": [2.0, 3.0, 4.0, 10.0, 5.0, -1.0]}) tm.assert_frame_equal(result, expected) @@ -791,7 +805,11 @@ def 
test_transform_numeric_ret(cols, exp, comp_func, agg_func, request): {"a": date_range("2018-01-01", periods=3), "b": range(3), "c": range(7, 10)} ) - result = df.groupby("b")[cols].transform(agg_func) + warn = FutureWarning + if isinstance(exp, Series) or agg_func != "size": + warn = None + with tm.assert_produces_warning(warn, match="Dropping invalid columns"): + result = df.groupby("b")[cols].transform(agg_func) if agg_func == "rank": exp = exp.astype("float") @@ -1103,7 +1121,12 @@ def test_transform_agg_by_name(request, reduction_func, obj): args = {"nth": [0], "quantile": [0.5], "corrwith": [obj]}.get(func, []) - result = g.transform(func, *args) + warn = None + if isinstance(obj, DataFrame) and func == "size": + warn = FutureWarning + + with tm.assert_produces_warning(warn, match="Dropping invalid columns"): + result = g.transform(func, *args) # this is the *definition* of a transformation tm.assert_index_equal(result.index, obj.index)