diff --git a/doc/source/whatsnew/v1.3.3.rst b/doc/source/whatsnew/v1.3.3.rst index 5ffc1a20b382f..952dda8665e1c 100644 --- a/doc/source/whatsnew/v1.3.3.rst +++ b/doc/source/whatsnew/v1.3.3.rst @@ -26,6 +26,7 @@ Fixed regressions - Fixed regression in :func:`is_list_like` where objects with ``__iter__`` set to ``None`` would be identified as iterable (:issue:`43373`) - Fixed regression in :meth:`.Resampler.aggregate` when used after column selection would raise if ``func`` is a list of aggregation functions (:issue:`42905`) - Fixed regression in :meth:`DataFrame.corr` where Kendall correlation would produce incorrect results for columns with repeated values (:issue:`43401`) +- Fixed regression in :meth:`DataFrame.groupby` where aggregation on columns with object types dropped results on those columns (:issue:`42395`, :issue:`43108`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8022d967a90d3..2da42b3fa11e7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1152,11 +1152,14 @@ def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = Fals def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: """ Determine subclass-specific default value for 'numeric_only'. + For SeriesGroupBy we want the default to be False (to match Series behavior). For DataFrameGroupBy we want it to be True (for backwards-compat). + Parameters ---------- numeric_only : bool or lib.no_default + Returns ------- bool @@ -1167,14 +1170,19 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: if self.obj.ndim == 2: # i.e. DataFrameGroupBy numeric_only = True + # GH#42395 GH#43108 GH#43154 + # Regression from 1.2.5 to 1.3 caused object columns to be dropped obj = self._obj_with_exclusions check = obj._get_numeric_data() if len(obj.columns) and not len(check.columns) and not obj.empty: numeric_only = False + # TODO: v1.4+ Add FutureWarning else: numeric_only = False - return numeric_only + # error: Incompatible return value type (got "Union[bool, NoDefault]", + # expected "bool") + return numeric_only # type: ignore[return-value] @cache_readonly def _group_keys_index(self) -> Index: diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 33066e848c8ef..3ae11847cc06b 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -10,10 +10,8 @@ from pandas import ( DataFrame, Index, - Int64Index, MultiIndex, Series, - Timedelta, Timestamp, date_range, ) @@ -264,64 +262,6 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): tm.assert_index_equal(result.columns, expected_columns) - def test_groupby_aggregation_non_numeric_dtype(self): - # GH #43108 - df = DataFrame( - [["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]], columns=["MW", "v"] - ) - - expected = DataFrame( - { - "v": [[1, 1], [10, 20]], - }, - index=Index(["M", "W"], dtype="object", name="MW"), - ) - - gb = df.groupby(by=["MW"]) - result = gb.sum() - tm.assert_frame_equal(result, expected) - - def test_groupby_aggregation_multi_non_numeric_dtype(self): - # GH #42395 - df = DataFrame( - { - "x": [1, 0, 1, 1, 0], - "y": [Timedelta(i, "days") for i in range(1, 6)], - "z": [Timedelta(i * 10, "days") for i in range(1, 6)], - } - ) - - expected = DataFrame( - { - "y": [Timedelta(i, "days") for i in range(7, 9)], - "z": [Timedelta(i * 10, "days") for i in range(7, 9)], - }, - index=Int64Index([0, 1], dtype="int64", name="x"), - ) - - gb = df.groupby(by=["x"]) - result = gb.sum() - tm.assert_frame_equal(result, expected) - - def test_groupby_aggregation_numeric_with_non_numeric_dtype(self): - # GH #43108 - df = DataFrame( - { - "x": [1, 0, 1, 1, 0], - "y": [Timedelta(i, "days") for i in range(1, 6)], - "z": [i for i in range(1, 6)], - } - ) - - expected = DataFrame( - {"z": [7, 8]}, - index=Int64Index([0, 1], dtype="int64", name="x"), - ) - - gb = df.groupby(by=["x"]) - result = gb.sum() - tm.assert_frame_equal(result, expected) - class TestGroupByNonCythonPaths: # GH#5610 non-cython calls should not include the grouper diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index f26f18c9c20a0..7e8a0157f3b5c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -14,9 +14,11 @@ DataFrame, Grouper, Index, + Int64Index, MultiIndex, RangeIndex, Series, + Timedelta, Timestamp, date_range, read_csv, @@ -2392,6 +2394,67 @@ def test_groupby_empty_multi_column(as_index, numeric_only): tm.assert_frame_equal(result, expected) +def test_groupby_aggregation_non_numeric_dtype(): + # GH #43108 + df = DataFrame( + [["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]], columns=["MW", "v"] + ) + + expected = DataFrame( + { + "v": [[1, 1], [10, 20]], + }, + index=Index(["M", "W"], dtype="object", name="MW"), + ) + + gb = df.groupby(by=["MW"]) + result = gb.sum() + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_multi_non_numeric_dtype(): + # GH #42395 + df = DataFrame( + { + "x": [1, 0, 1, 1, 0], + "y": [Timedelta(i, "days") for i in range(1, 6)], + "z": [Timedelta(i * 10, "days") for i in range(1, 6)], + } + ) + + expected = DataFrame( + { + "y": [Timedelta(i, "days") for i in range(7, 9)], + "z": [Timedelta(i * 10, "days") for i in range(7, 9)], + }, + index=Int64Index([0, 1], dtype="int64", name="x"), + ) + + gb = df.groupby(by=["x"]) + result = gb.sum() + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_numeric_with_non_numeric_dtype(): + # GH #43108 + df = DataFrame( + { + "x": [1, 0, 1, 1, 0], + "y": [Timedelta(i, "days") for i in range(1, 6)], + "z": list(range(1, 6)), + } + ) + + expected = DataFrame( + {"z": [7, 8]}, + index=Int64Index([0, 1], dtype="int64", name="x"), + ) + + gb = df.groupby(by=["x"]) + result = gb.sum() + tm.assert_frame_equal(result, expected) + + def test_groupby_filtered_df_std(): # GH 16174 dicts = [