FIX-modin-project#1770: Support for groupby() with original Series in `by` list.

Signed-off-by: Itamar Turner-Trauring <itamar@itamarst.org>
itamarst · Jul 29, 2020 · 8abd84b · 8abd84b
1 parent 02977df
commit 8abd84b
Show file tree

Hide file tree

Showing 2 changed files with 43 additions and 1 deletion.
diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
@@ -440,7 +440,21 @@ def groupby(
             by = by._query_compiler
         elif is_list_like(by):
             # fastpath for multi column groupby
-            if not isinstance(by, Series) and axis == 0 and all(o in self for o in by):
+            if (
+                not isinstance(by, Series)
+                and axis == 0
+                and all(
+                    (
+                        (isinstance(o, str) and (o in self))
+                        or (isinstance(o, Series) and (o._parent is self))
+                    )
+                    for o in by
+                )
+            ):
+                # We can just revert Series back to names because the parent is
+                # this dataframe:
+                by = [o.name if isinstance(o, Series) else o for o in by]
+
                 warnings.warn(
                     "Multi-column groupby is a new feature. "
                     "Please report any bugs/issues to bug_reports@modin.org."

diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py
@@ -1134,3 +1134,31 @@ def test_to_pandas_convertion(kwargs):
     by = ["a", "b"]
 
     eval_aggregation(*create_test_dfs(data), by=by, **kwargs)
+
+
+@pytest.mark.parametrize(
+    # Each case is a list of (lookup, name) pairs: True -> df[name], False -> name
+    "columns",
+    [
+        [(False, "a"), (False, "b"), (False, "c")],
+        [(False, "a"), (False, "b")],
+        [(True, "a"), (True, "b"), (True, "c")],
+        [(True, "a"), (True, "b")],
+        [(False, "a"), (False, "b"), (True, "c")],
+        [(False, "a"), (True, "c")],
+    ],
+)
+def test_mixed_columns(columns):
+    def get_columns(df):  # build the groupby keys for `df` from the (lookup, name) pairs
+        return [df[name] if lookup else name for (lookup, name) in columns]
+
+    data = {"a": [1, 1, 2], "b": [11, 11, 22], "c": [111, 111, 222]}
+
+    df1 = pandas.DataFrame(data)  # presumably stock pandas (import not shown) - confirm
+    df1 = pandas.concat([df1])
+    ref = df1.groupby(get_columns(df1)).size()
+
+    df2 = pd.DataFrame(data)  # presumably modin.pandas (import not shown) - confirm
+    df2 = pd.concat([df2])
+    exp = df2.groupby(get_columns(df2)).size()
+    df_equals(ref, exp)