Skip to content

Commit

Permalink
FIX-modin-project#1770: Support for groupby() with original Series in…
Browse files Browse the repository at this point in the history
… by list.

Signed-off-by: Itamar Turner-Trauring <itamar@itamarst.org>
  • Loading branch information
itamarst committed Jul 29, 2020
1 parent 02977df commit 8abd84b
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 1 deletion.
16 changes: 15 additions & 1 deletion modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,21 @@ def groupby(
by = by._query_compiler
elif is_list_like(by):
# fastpath for multi column groupby
if not isinstance(by, Series) and axis == 0 and all(o in self for o in by):
if (
not isinstance(by, Series)
and axis == 0
and all(
(
(isinstance(o, str) and (o in self))
or (isinstance(o, Series) and (o._parent is self))
)
for o in by
)
):
# We can just revert Series back to names because the parent is
# this dataframe:
by = [o.name if isinstance(o, Series) else o for o in by]

warnings.warn(
"Multi-column groupby is a new feature. "
"Please report any bugs/issues to bug_reports@modin.org."
Expand Down
28 changes: 28 additions & 0 deletions modin/pandas/test/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1134,3 +1134,31 @@ def test_to_pandas_convertion(kwargs):
by = ["a", "b"]

eval_aggregation(*create_test_dfs(data), by=by, **kwargs)


@pytest.mark.parametrize(
    # Each case is a list of (as_series, name) pairs: when as_series is True
    # the grouping key is df[name], otherwise it is the bare column name.
    "columns",
    [
        [(False, "a"), (False, "b"), (False, "c")],
        [(False, "a"), (False, "b")],
        [(True, "a"), (True, "b"), (True, "c")],
        [(True, "a"), (True, "b")],
        [(False, "a"), (False, "b"), (True, "c")],
        [(False, "a"), (True, "c")],
    ],
)
def test_mixed_columns(columns):
    """Check groupby() with a list mixing column names and column lookups.

    The same grouping keys are built against a ``pandas`` frame and a ``pd``
    frame, and the ``size()`` results of both are compared with ``df_equals``.
    """

    def build_keys(frame):
        # Resolve each (as_series, name) pair against the given frame so the
        # looked-up columns belong to the frame that is being grouped.
        keys = []
        for as_series, label in columns:
            keys.append(frame[label] if as_series else label)
        return keys

    data = {"a": [1, 1, 2], "b": [11, 11, 22], "c": [111, 111, 222]}

    # NOTE(review): the single-frame concat is kept exactly as in the original
    # test; its purpose is not stated in this file section.
    reference_frame = pandas.concat([pandas.DataFrame(data)])
    reference_sizes = reference_frame.groupby(build_keys(reference_frame)).size()

    # `pd` is presumably modin.pandas -- confirm against the module imports.
    candidate_frame = pd.concat([pd.DataFrame(data)])
    candidate_sizes = candidate_frame.groupby(build_keys(candidate_frame)).size()

    df_equals(reference_sizes, candidate_sizes)

0 comments on commit 8abd84b

Please sign in to comment.