Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX-#2254: handling dictionary functions at groupby aggregation improved #2267

Merged
merged 1 commit into from
Oct 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 31 additions & 12 deletions modin/pandas/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import pandas
import pandas.core.groupby
from pandas.core.dtypes.common import is_list_like
from pandas.core.aggregation import reconstruct_func
import pandas.core.common as com

from modin.error_message import ErrorMessage
Expand Down Expand Up @@ -301,29 +302,40 @@ def aggregate(self, func=None, *args, **kwargs):
# so we throw a different message
raise NotImplementedError("axis other than 0 is not supported")
if isinstance(func, dict) or func is None:
if func is None:
func = {}
else:
if any(i not in self._df.columns for i in func.keys()):
from pandas.core.base import SpecificationError

raise SpecificationError("nested renamer is not supported")
def _reconstruct_func(func, **kwargs):
relabeling_required, func, new_columns, order = reconstruct_func(
anmyachev marked this conversation as resolved.
Show resolved Hide resolved
func, **kwargs
)
# We convert to the string version of the function for simplicity.
func = {
k: v
if not callable(v) or v.__name__ not in dir(self)
else v.__name__
for k, v in func.items()
}
return relabeling_required, func, new_columns, order

relabeling_required, func_dict, new_columns, order = _reconstruct_func(
func, **kwargs
)

if any(i not in self._df.columns for i in func_dict.keys()):
from pandas.core.base import SpecificationError

raise SpecificationError("nested renamer is not supported")
if isinstance(self._by, type(self._query_compiler)):
by = list(self._by.columns)
else:
by = self._by
# We convert to the string version of the function for simplicity.
func_dict = {
k: v if not callable(v) or v.__name__ not in dir(self) else v.__name__
for k, v in func.items()
}

subset_cols = list(func_dict.keys()) + (
list(self._by.columns)
if isinstance(self._by, type(self._query_compiler))
and all(c in self._df.columns for c in self._by.columns)
else []
)
return type(self._df)(
result = type(self._df)(
query_compiler=self._df[subset_cols]._query_compiler.groupby_dict_agg(
by=by,
func_dict=func_dict,
Expand All @@ -332,6 +344,13 @@ def aggregate(self, func=None, *args, **kwargs):
drop=self._drop,
)
)

if relabeling_required:
result = result.iloc[:, order]
result.columns = new_columns

return result

if is_list_like(func):
return self._default_to_pandas(
lambda df, *args, **kwargs: df.aggregate(func, *args, **kwargs),
Expand Down
1 change: 0 additions & 1 deletion modin/pandas/test/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1109,7 +1109,6 @@ def test_groupby_multiindex():
df_equals(modin_df.groupby(by=by).count(), pandas_df.groupby(by=by).count())


@pytest.mark.skip("See Modin issue #2254 for details")
def test_agg_func_None_rename():
pandas_df = pandas.DataFrame(
{
Expand Down