modin-project · dchigarev · Nov 14, 2020 · Nov 13, 2020 · Nov 13, 2020 · Nov 14, 2020
@@ -2606,11 +2606,15 @@ def groupby_agg_builder(df):
             def compute_groupby(df):
                 grouped_df = df.groupby(by=by, axis=axis, **groupby_kwargs)
                 try:
-                    result = (
-                        grouped_df.agg(agg_func)
-                        if isinstance(agg_func, dict)
-                        else agg_func(grouped_df, **agg_kwargs)
-                    )
+                    if isinstance(agg_func, dict):
+                        # Filter out keys that don't exist in this partition. This happens when some columns
+                        # from the original dataframe didn't end up in every partition.
+                        partition_dict = {
+                            k: v for k, v in agg_func.items() if k in df.columns
+                        }
+                        result = grouped_df.agg(partition_dict)
+                    else:
+                        result = agg_func(grouped_df, **agg_kwargs)
                 # This happens when the partition is filled with non-numeric data and a
                 # numeric operation is done. We need to build the index here to avoid
                 # issues with extracting the index.

@@ -22,6 +22,7 @@
     check_df_columns_have_nans,
     create_test_dfs,
     eval_general,
+    test_data,
     test_data_values,
     modin_df_almost_equals_pandas,
 )
@@ -1189,23 +1190,36 @@ def test_shift_freq(groupby_axis, shift_axis):
         )
 
 
-def test_agg_func_None_rename():
-    pandas_df = pandas.DataFrame(
+@pytest.mark.parametrize(
+    "by_and_agg_dict",
+    [
         {
-            "col1": np.random.randint(0, 100, size=1000),
-            "col2": np.random.randint(0, 100, size=1000),
-            "col3": np.random.randint(0, 100, size=1000),
-            "col4": np.random.randint(0, 100, size=1000),
+            "by": [
+                list(test_data["int_data"].keys())[0],
+                list(test_data["int_data"].keys())[1],
+            ],
+            "agg_dict": {
+                "max": (list(test_data["int_data"].keys())[2], np.max),
+                "min": (list(test_data["int_data"].keys())[2], np.min),
+            },
         },
-        index=["row{}".format(i) for i in range(1000)],
-    )
-    modin_df = from_pandas(pandas_df)
+        {
+            "by": ["col1"],
+            "agg_dict": {
+                "max": (list(test_data["int_data"].keys())[0], np.max),
+                "min": (list(test_data["int_data"].keys())[-1], np.min),
+            },
+        },
+    ],
+)
+def test_agg_func_None_rename(by_and_agg_dict):
+    modin_df, pandas_df = create_test_dfs(test_data["int_data"])
 
-    modin_result = modin_df.groupby(["col1", "col2"]).agg(
-        max=("col3", np.max), min=("col3", np.min)
+    modin_result = modin_df.groupby(by_and_agg_dict["by"]).agg(
+        **by_and_agg_dict["agg_dict"]
     )
-    pandas_result = pandas_df.groupby(["col1", "col2"]).agg(
-        max=("col3", np.max), min=("col3", np.min)
+    pandas_result = pandas_df.groupby(by_and_agg_dict["by"]).agg(
+        **by_and_agg_dict["agg_dict"]
     )
     df_equals(modin_result, pandas_result)