From e58a19340881d0122fdc9e3ec400adb891a1be69 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Mon, 8 Feb 2021 14:42:31 +0100
Subject: [PATCH] REGR: fix case all-NaN/numeric object column in groupby (#39655)

---
 doc/source/whatsnew/v1.2.2.rst                |  3 +-
 pandas/core/groupby/generic.py                | 11 +++++--
 .../tests/groupby/aggregate/test_aggregate.py | 24 ++++++++++++++
 .../tests/resample/test_resampler_grouper.py  | 31 +++++++++++++++++++
 4 files changed, 65 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst
index 46023e2ce08f5..60683030dca15 100644
--- a/doc/source/whatsnew/v1.2.2.rst
+++ b/doc/source/whatsnew/v1.2.2.rst
@@ -23,7 +23,8 @@ Fixed regressions
 - Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`)
 - Fixed regression in :meth:`~DataFrame.to_excel` creating corrupt files when appending (``mode="a"``) to an existing file (:issue:`39576`)
 - Fixed regression in :meth:`DataFrame.transform` failing in case of an empty DataFrame or Series (:issue:`39636`)
-- Fixed regression in :meth:`core.window.rolling.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
+- Fixed regression in :meth:`~DataFrame.groupby` or :meth:`~DataFrame.resample` when aggregating an all-NaN or numeric object dtype column (:issue:`39329`)
+- Fixed regression in :meth:`.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
 - Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`)
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 7b6eb4c8fe2f9..04e9eb039c249 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1102,11 +1102,16 @@ def py_fallback(bvalues: ArrayLike) -> ArrayLike:
             assert isinstance(result, (Series, DataFrame))  # for mypy
             mgr = result._mgr
             assert isinstance(mgr, BlockManager)
-            assert len(mgr.blocks) == 1
 
             # unwrap DataFrame to get array
-            result = mgr.blocks[0].values
-            return result
+            if len(mgr.blocks) != 1:
+                # We've split an object block! Everything we've assumed
+                # about a single block input returning a single block output
+                # is a lie. See eg GH-39329
+                return mgr.as_array()
+            else:
+                result = mgr.blocks[0].values
+                return result
 
         def blk_func(bvalues: ArrayLike) -> ArrayLike:
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index de4ef0996ad49..48527de6b2047 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -1187,3 +1187,27 @@ def test_aggregate_datetime_objects():
     result = df.groupby("A").B.max()
     expected = df.set_index("A")["B"]
     tm.assert_series_equal(result, expected)
+
+
+def test_aggregate_numeric_object_dtype():
+    # https://github.com/pandas-dev/pandas/issues/39329
+    # simplified case: multiple object columns where one is all-NaN
+    # -> gets split as the all-NaN is inferred as float
+    df = DataFrame(
+        {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4},
+    ).astype(object)
+    result = df.groupby("key").min()
+    expected = DataFrame(
+        {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]}
+    ).set_index("key")
+    tm.assert_frame_equal(result, expected)
+
+    # same but with numbers
+    df = DataFrame(
+        {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)},
+    ).astype(object)
+    result = df.groupby("key").min()
+    expected = DataFrame(
+        {"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]}
+    ).set_index("key")
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py
index 1a10255a81a8c..39d4533ca08dc 100644
--- a/pandas/tests/resample/test_resampler_grouper.py
+++ b/pandas/tests/resample/test_resampler_grouper.py
@@ -392,3 +392,34 @@ def test_resample_groupby_agg():
     result = resampled.agg({"num": "sum"})
 
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("consolidate", [True, False])
+def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
+    # https://github.com/pandas-dev/pandas/issues/39329
+
+    dates = pd.date_range("2020-01-01", periods=15, freq="D")
+    df1 = DataFrame({"key": "A", "date": dates, "col1": range(15), "col_object": "val"})
+    df2 = DataFrame({"key": "B", "date": dates, "col1": range(15)})
+    df = pd.concat([df1, df2], ignore_index=True)
+    if consolidate:
+        df = df._consolidate()
+
+    result = df.groupby(["key"]).resample("W", on="date").min()
+    idx = pd.MultiIndex.from_arrays(
+        [
+            ["A"] * 3 + ["B"] * 3,
+            pd.to_datetime(["2020-01-05", "2020-01-12", "2020-01-19"] * 2),
+        ],
+        names=["key", "date"],
+    )
+    expected = DataFrame(
+        {
+            "key": ["A"] * 3 + ["B"] * 3,
+            "date": pd.to_datetime(["2020-01-01", "2020-01-06", "2020-01-13"] * 2),
+            "col1": [0, 5, 12] * 2,
+            "col_object": ["val"] * 3 + [np.nan] * 3,
+        },
+        index=idx,
+    )
+    tm.assert_frame_equal(result, expected)
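
For reference, a minimal sketch of the regression this patch addresses, adapted from the new test_aggregate_numeric_object_dtype test above. The pre-patch behaviour described in the comments is inferred from the removed assert len(mgr.blocks) == 1 rather than reproduced output, so treat it as an assumption.

import numpy as np
import pandas as pd

# Object-dtype frame where one column is all-NaN: after the per-group
# aggregation that column comes back inferred as float, so the result is
# backed by two blocks instead of the single block py_fallback assumed.
df = pd.DataFrame(
    {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4}
).astype(object)

# On unpatched pandas 1.2.0/1.2.1 this path hit the removed assertion and
# raised an AssertionError; with this patch it returns the grouped minima.
result = df.groupby("key").min()
print(result)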