diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 1556c88aaecc6..b36499c340fd9 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -1061,6 +1061,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.transform` with non-unique columns incorrectly raising ``AttributeError`` (:issue:`41427`) - Bug in :meth:`Resampler.apply` with non-unique columns incorrectly dropping duplicated columns (:issue:`41445`) - Bug in :meth:`SeriesGroupBy` aggregations incorrectly returning empty :class:`Series` instead of raising ``TypeError`` on aggregations that are invalid for its dtype, e.g. ``.prod`` with ``datetime64[ns]`` dtype (:issue:`41342`) +- Bug in :class:`DataFrameGroupBy` aggregations incorrectly failing to drop columns with invalid dtypes for that aggregation when there are no valid columns (:issue:`41291`) - Bug in :meth:`DataFrame.rolling.__iter__` where ``on`` was not assigned to the index of the resulting objects (:issue:`40373`) - Bug in :meth:`DataFrameGroupBy.transform` and :meth:`DataFrameGroupBy.agg` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`41647`) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 7a286188c4e74..b72b927b3c2a8 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -516,7 +516,7 @@ def group_add(add_t[:, ::1] out, val = values[i, j] # not nan - if val == val: + if not checknull(val): nobs[lab, j] += 1 if nobs[lab, j] == 1: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b51fb2234e148..69f992f840c7c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -67,10 +67,7 @@ validate_func_kwargs, ) from pandas.core.apply import GroupByApply -from pandas.core.base import ( - DataError, - SpecificationError, -) +from pandas.core.base import SpecificationError import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame @@ -516,16 +513,12 @@ def _cython_transform( obj = self._selected_obj - is_numeric = is_numeric_dtype(obj.dtype) - if numeric_only and not is_numeric: - raise DataError("No numeric types to aggregate") - try: result = self.grouper._cython_operation( "transform", obj._values, how, axis, **kwargs ) - except (NotImplementedError, TypeError): - raise DataError("No numeric types to aggregate") + except NotImplementedError as err: + raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err return obj._constructor(result, index=self.obj.index, name=obj.name) @@ -1064,7 +1057,6 @@ def _cython_agg_general( # Note: we never get here with how="ohlc"; that goes through SeriesGroupBy data: Manager2D = self._get_data_to_aggregate() - orig = data if numeric_only: data = data.get_numeric_data(copy=False) @@ -1087,9 +1079,6 @@ def array_func(values: ArrayLike) -> ArrayLike: # continue and exclude the block new_mgr = data.grouped_reduce(array_func, ignore_failures=True) - if not len(new_mgr) and len(orig): - # If the original Manager was already empty, no need to raise - raise DataError("No numeric types to aggregate") if len(new_mgr) < len(data): warnings.warn( f"Dropping invalid columns in {type(self).__name__}.{how} " diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b00a1160fb01b..6deb5bb1a76f0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1339,20 +1339,12 @@ def _agg_general( with group_selection_context(self): # try a cython aggregation if we can - result = None - try: - result = self._cython_agg_general( - how=alias, - alt=npfunc, - numeric_only=numeric_only, - min_count=min_count, - ) - except DataError: - pass - - # apply a non-cython aggregation - if result is None: - result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) + result = self._cython_agg_general( + how=alias, + alt=npfunc, + numeric_only=numeric_only, + min_count=min_count, + ) return result.__finalize__(self.obj, method="groupby") def _agg_py_fallback( diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index eb82e03aea82f..851dd7311183f 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -128,8 +128,9 @@ def test_groupby_aggregation_multi_level_column(): columns=MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 0), ("B", 1)]), ) - result = df.groupby(level=1, axis=1).sum() - expected = DataFrame({0: [2.0, 1, 1, 1], 1: [1, 0, 1, 1]}) + gb = df.groupby(level=1, axis=1) + result = gb.sum(numeric_only=False) + expected = DataFrame({0: [2.0, True, True, True], 1: [1, 0, 1, 1]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index cf1177d231e37..a035c5500e2dc 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -18,7 +18,6 @@ bdate_range, ) import pandas._testing as tm -from pandas.core.groupby.groupby import DataError @pytest.mark.parametrize( @@ -98,9 +97,9 @@ def test_cython_agg_nothing_to_agg(): frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) - msg = "No numeric types to aggregate" - with pytest.raises(DataError, match=msg): - frame[["b"]].groupby(frame["a"]).mean() + result = frame[["b"]].groupby(frame["a"]).mean() + expected = DataFrame([], index=frame["a"].sort_values().drop_duplicates()) + tm.assert_frame_equal(result, expected) def test_cython_agg_nothing_to_agg_with_dates(): diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 4d30543355d47..79990deed261d 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -433,15 +433,22 @@ def test_agg_over_numpy_arrays(): ], columns=["category", "arraydata"], ) - result = df.groupby("category").agg(sum) + gb = df.groupby("category") expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]] expected_index = Index([1, 2], name="category") expected_column = ["arraydata"] expected = DataFrame(expected_data, index=expected_index, columns=expected_column) + alt = gb.sum(numeric_only=False) + tm.assert_frame_equal(alt, expected) + + result = gb.agg("sum", numeric_only=False) tm.assert_frame_equal(result, expected) + # FIXME: the original version of this test called `gb.agg(sum)` + # and that raises TypeError if `numeric_only=False` is passed + @pytest.mark.parametrize("as_period", [True, False]) def test_agg_tzaware_non_datetime_result(as_period): @@ -524,9 +531,14 @@ def test_sum_uint64_overflow(): ) expected.index.name = 0 - result = df.groupby(0).sum() + result = df.groupby(0).sum(numeric_only=False) tm.assert_frame_equal(result, expected) + # out column is non-numeric, so with numeric_only=True it is dropped + result2 = df.groupby(0).sum(numeric_only=True) + expected2 = expected[[]] + tm.assert_frame_equal(result2, expected2) + @pytest.mark.parametrize( "structure, expected", diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 719fdb353e3cf..382a940d2a92c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -638,7 +638,7 @@ def test_as_index_select_column(): def test_groupby_as_index_select_column_sum_empty_df(): # GH 35246 df = DataFrame(columns=["A", "B", "C"]) - left = df.groupby(by="A", as_index=False)["B"].sum() + left = df.groupby(by="A", as_index=False)["B"].sum(numeric_only=False) assert type(left) is DataFrame assert left.to_dict() == {"A": {}, "B": {}} @@ -1861,6 +1861,49 @@ def get_result(): get_result() return + else: + # ie. DataFrameGroupBy + if op in ["prod", "sum"]: + # ops that require more than just ordered-ness + if method != "apply": + # FIXME: apply goes through different code path + if df.dtypes[0].kind == "M": + # GH#41291 + # datetime64 -> prod and sum are invalid + result = get_result() + + # with numeric_only=True, these are dropped, and we get + # an empty DataFrame back + expected = df.set_index(keys)[[]] + tm.assert_equal(result, expected) + return + + elif isinstance(values, Categorical): + # GH#41291 + # Categorical doesn't implement sum or prod + result = get_result() + + # with numeric_only=True, these are dropped, and we get + # an empty DataFrame back + expected = df.set_index(keys)[[]] + if len(keys) != 1 and op == "prod": + # TODO: why just prod and not sum? + # Categorical is special without 'observed=True' + lev = Categorical([0], dtype=values.dtype) + mi = MultiIndex.from_product([lev, lev], names=["A", "B"]) + expected = DataFrame([], columns=[], index=mi) + + tm.assert_equal(result, expected) + return + + elif df.dtypes[0] == object: + # FIXME: the test is actually wrong here, xref #41341 + result = get_result() + # In this case we have list-of-list, will raise TypeError, + # and subsequently be dropped as nuisance columns + expected = df.set_index(keys)[[]] + tm.assert_equal(result, expected) + return result = get_result() expected = df.set_index(keys)[columns] @@ -2313,12 +2356,17 @@ def test_groupby_all_nan_groups_drop(): def test_groupby_empty_multi_column(): # GH 15106 - result = DataFrame(data=[], columns=["A", "B", "C"]).groupby(["A", "B"]).sum() + df = DataFrame(data=[], columns=["A", "B", "C"]) + gb = df.groupby(["A", "B"]) + result = gb.sum(numeric_only=False) expected = DataFrame( [], columns=["C"], index=MultiIndex([[], []], [[], []], names=["A", "B"]) ) tm.assert_frame_equal(result, expected) + result = gb.sum(numeric_only=True) + tm.assert_frame_equal(result, expected[[]]) + def test_groupby_filtered_df_std(): # GH 16174 diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index dae5c7274ffc5..9062049029e4d 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -24,7 +24,6 @@ DataFrameGroupBy, SeriesGroupBy, ) -from pandas.core.groupby.groupby import DataError def assert_fp_equal(a, b): @@ -741,11 +740,21 @@ def test_cython_transform_frame(op, args, targop): tm.assert_frame_equal(expected, getattr(gb, op)(*args).sort_index(axis=1)) # individual columns for c in df: - if c not in ["float", "int", "float_missing"] and op != "shift": - msg = "No numeric types to aggregate" - with pytest.raises(DataError, match=msg): + if ( + c not in ["float", "int", "float_missing"] + and op != "shift" + and not (c == "timedelta" and op == "cumsum") + ): + msg = "|".join( + [ + "does not support .* operations", + ".* is not supported for object dtype", + "is not implemented for this dtype", + ] + ) + with pytest.raises(TypeError, match=msg): gb[c].transform(op) - with pytest.raises(DataError, match=msg): + with pytest.raises(TypeError, match=msg): getattr(gb[c], op)() else: expected = gb[c].apply(targop)