Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: RollingGroupby MultiIndex levels dropped #40701

Merged
merged 8 commits into from
Apr 5, 2021
32 changes: 32 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,38 @@ However, floating point artifacts may now exist in the results when rolling over
s = pd.Series([7, 5, 5, 5])
s.rolling(3).var()

.. _whatsnew_130.notable_bug_fixes.rolling_groupby_multiindex:

GroupBy.rolling with MultiIndex no longer drops levels in the result
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

:class:`core.window.rolling.RollingGroupby` no longer drops index levels from the result when the grouped
:class:`DataFrame` has a :class:`MultiIndex`. Because the group keys are prepended to the original index, a level that is
also used as a group key can appear twice in the resulting :class:`MultiIndex`; this restores the behavior of version 1.1.3 (:issue:`38787`, :issue:`38523`).


.. ipython:: python

    index = pd.MultiIndex.from_tuples([('idx1', 'idx2')], names=['label1', 'label2'])
    df = pd.DataFrame({'a': [1], 'b': [2]}, index=index)
    df

*Previous behavior*:

.. code-block:: ipython

    In [1]: df.groupby('label1').rolling(1).sum()
    Out[1]:
              a    b
    label1
    idx1    1.0  2.0

*New behavior*:

.. ipython:: python

    df.groupby('label1').rolling(1).sum()


.. _whatsnew_130.api_breaking.deps:

Expand Down
23 changes: 10 additions & 13 deletions pandas/core/window/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,26 +577,23 @@ def _apply(
numba_cache_key,
**kwargs,
)
# Reconstruct the resulting MultiIndex from tuples
# Reconstruct the resulting MultiIndex
# 1st set of levels = group by labels
# 2nd set of levels = original index
# Ignore 2nd set of levels if a group by label include an index level
result_index_names = copy.copy(self._grouper.names)
grouped_object_index = None
# 2nd set of levels = original DataFrame/Series index
grouped_object_index = self.obj.index
grouped_index_name = [*grouped_object_index.names]
groupby_keys = copy.copy(self._grouper.names)
result_index_names = groupby_keys + grouped_index_name

column_keys = [
drop_columns = [
key
for key in result_index_names
for key in self._grouper.names
if key not in self.obj.index.names or key is None
]

if len(column_keys) == len(result_index_names):
grouped_object_index = self.obj.index
grouped_index_name = [*grouped_object_index.names]
result_index_names += grouped_index_name
else:
if len(drop_columns) != len(groupby_keys):
# The result still contains the grouping column(s); drop them
result = result.drop(columns=column_keys, errors="ignore")
result = result.drop(columns=drop_columns, errors="ignore")

codes = self._grouper.codes
levels = copy.copy(self._grouper.levels)
Expand Down
47 changes: 42 additions & 5 deletions pandas/tests/window/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,23 +588,31 @@ def test_groupby_rolling_nans_in_index(self, rollings, key):
with pytest.raises(ValueError, match=f"{key} must be monotonic"):
df.groupby("c").rolling("60min", **rollings)

def test_groupby_rolling_group_keys(self):
@pytest.mark.parametrize("group_keys", [True, False])
def test_groupby_rolling_group_keys(self, group_keys):
# GH 37641
# GH 38523: GH 37641 actually was not a bug.
# group_keys only applies to groupby.apply directly
arrays = [["val1", "val1", "val2"], ["val1", "val1", "val2"]]
index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2"))

s = Series([1, 2, 3], index=index)
result = s.groupby(["idx1", "idx2"], group_keys=False).rolling(1).mean()
result = s.groupby(["idx1", "idx2"], group_keys=group_keys).rolling(1).mean()
expected = Series(
[1.0, 2.0, 3.0],
index=MultiIndex.from_tuples(
[("val1", "val1"), ("val1", "val1"), ("val2", "val2")],
names=["idx1", "idx2"],
[
("val1", "val1", "val1", "val1"),
("val1", "val1", "val1", "val1"),
("val2", "val2", "val2", "val2"),
],
names=["idx1", "idx2", "idx1", "idx2"],
),
)
tm.assert_series_equal(result, expected)

def test_groupby_rolling_index_level_and_column_label(self):
# The groupby keys should not appear as a resulting column
arrays = [["val1", "val1", "val2"], ["val1", "val1", "val2"]]
index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2"))

Expand All @@ -613,7 +621,12 @@ def test_groupby_rolling_index_level_and_column_label(self):
expected = DataFrame(
{"B": [0.0, 1.0, 2.0]},
index=MultiIndex.from_tuples(
[("val1", 1), ("val1", 1), ("val2", 2)], names=["idx1", "A"]
[
("val1", 1, "val1", "val1"),
("val1", 1, "val1", "val1"),
("val2", 2, "val2", "val2"),
],
names=["idx1", "A", "idx1", "idx2"],
),
)
tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -695,6 +708,30 @@ def test_by_column_not_in_values(self, columns):
assert "A" not in result.columns
tm.assert_frame_equal(g.obj, original_obj)

def test_groupby_level(self):
# GH 38523, 38787
# Grouping by an index level (level=0, "Animal") and then rolling must keep
# every level of the original MultiIndex after the group key, so "Animal"
# appears twice in the expected result index.
arrays = [
["Falcon", "Falcon", "Parrot", "Parrot"],
["Captive", "Wild", "Captive", "Wild"],
]
index = MultiIndex.from_arrays(arrays, names=("Animal", "Type"))
df = DataFrame({"Max Speed": [390.0, 350.0, 30.0, 20.0]}, index=index)
result = df.groupby(level=0)["Max Speed"].rolling(2).sum()
# With a window of 2, the first row of each group is NaN and the second is
# the sum of both rows in that group (390 + 350 and 30 + 20).
expected = Series(
[np.nan, 740.0, np.nan, 50.0],
index=MultiIndex.from_tuples(
[
# (group key, original "Animal" level, original "Type" level)
("Falcon", "Falcon", "Captive"),
("Falcon", "Falcon", "Wild"),
("Parrot", "Parrot", "Captive"),
("Parrot", "Parrot", "Wild"),
],
names=["Animal", "Animal", "Type"],
),
name="Max Speed",
)
tm.assert_series_equal(result, expected)


class TestExpanding:
def setup_method(self):
Expand Down