Skip to content

Commit

Permalink
REGR: fix case all-NaN/numeric object column in groupby (#39655)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche committed Feb 8, 2021
1 parent d4eee37 commit e58a193
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 4 deletions.
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v1.2.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ Fixed regressions
- Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`)
- Fixed regression in :meth:`~DataFrame.to_excel` creating corrupt files when appending (``mode="a"``) to an existing file (:issue:`39576`)
- Fixed regression in :meth:`DataFrame.transform` failing in case of an empty DataFrame or Series (:issue:`39636`)
- Fixed regression in :meth:`core.window.rolling.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
- Fixed regression in :meth:`~DataFrame.groupby` or :meth:`~DataFrame.resample` when aggregating an all-NaN or numeric object dtype column (:issue:`39329`)
- Fixed regression in :meth:`.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
- Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`)

.. ---------------------------------------------------------------------------
Expand Down
11 changes: 8 additions & 3 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1102,11 +1102,16 @@ def py_fallback(bvalues: ArrayLike) -> ArrayLike:
assert isinstance(result, (Series, DataFrame)) # for mypy
mgr = result._mgr
assert isinstance(mgr, BlockManager)
assert len(mgr.blocks) == 1

# unwrap DataFrame to get array
result = mgr.blocks[0].values
return result
if len(mgr.blocks) != 1:
# We've split an object block! Everything we've assumed
# about a single block input returning a single block output
# is a lie. See eg GH-39329
return mgr.as_array()
else:
result = mgr.blocks[0].values
return result

def blk_func(bvalues: ArrayLike) -> ArrayLike:

Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1187,3 +1187,27 @@ def test_aggregate_datetime_objects():
result = df.groupby("A").B.max()
expected = df.set_index("A")["B"]
tm.assert_series_equal(result, expected)


def test_aggregate_numeric_object_dtype():
# https://github.com/pandas-dev/pandas/issues/39329
# simplified case: multiple object columns where one is all-NaN
# -> gets split as the all-NaN is inferred as float
df = DataFrame(
{"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4},
).astype(object)
result = df.groupby("key").min()
expected = DataFrame(
{"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]}
).set_index("key")
tm.assert_frame_equal(result, expected)

# same but with numbers
df = DataFrame(
{"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)},
).astype(object)
result = df.groupby("key").min()
expected = DataFrame(
{"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]}
).set_index("key")
tm.assert_frame_equal(result, expected)
31 changes: 31 additions & 0 deletions pandas/tests/resample/test_resampler_grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,3 +392,34 @@ def test_resample_groupby_agg():
result = resampled.agg({"num": "sum"})

tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("consolidate", [True, False])
def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
# https://github.com/pandas-dev/pandas/issues/39329

dates = pd.date_range("2020-01-01", periods=15, freq="D")
df1 = DataFrame({"key": "A", "date": dates, "col1": range(15), "col_object": "val"})
df2 = DataFrame({"key": "B", "date": dates, "col1": range(15)})
df = pd.concat([df1, df2], ignore_index=True)
if consolidate:
df = df._consolidate()

result = df.groupby(["key"]).resample("W", on="date").min()
idx = pd.MultiIndex.from_arrays(
[
["A"] * 3 + ["B"] * 3,
pd.to_datetime(["2020-01-05", "2020-01-12", "2020-01-19"] * 2),
],
names=["key", "date"],
)
expected = DataFrame(
{
"key": ["A"] * 3 + ["B"] * 3,
"date": pd.to_datetime(["2020-01-01", "2020-01-06", "2020-01-13"] * 2),
"col1": [0, 5, 12] * 2,
"col_object": ["val"] * 3 + [np.nan] * 3,
},
index=idx,
)
tm.assert_frame_equal(result, expected)

0 comments on commit e58a193

Please sign in to comment.