Skip to content

Commit

Permalink
FIX-#6273: fix DataFrame.min/max/mean/median/skew/kurt with axis=None (#6275)
Browse files Browse the repository at this point in the history

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
  • Loading branch information
anmyachev authored Jun 16, 2023
1 parent d6c6611 commit d192e87
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 62 deletions.
4 changes: 4 additions & 0 deletions modin/core/dataframe/algebra/default2pandas/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@ def applyier(df, *args, **kwargs):
and func not in ("rdivmod", pandas.Series.rdivmod)
and func not in ("to_list", pandas.Series.to_list)
and func not in ("to_dict", pandas.Series.to_dict)
and func not in ("mean", pandas.DataFrame.mean)
and func not in ("median", pandas.DataFrame.median)
and func not in ("skew", pandas.DataFrame.skew)
and func not in ("kurt", pandas.DataFrame.kurt)
):
# When applying a DatetimeProperties or TimedeltaProperties function,
# if we don't specify the dtype for the DataFrame, the frame might
Expand Down
22 changes: 18 additions & 4 deletions modin/core/storage_formats/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -900,7 +900,7 @@ def reduce_func(df, **kwargs):
return TreeReduce.register(map_func, reduce_func)(self, axis=axis, **kwargs)

def mean(self, axis, **kwargs):
if kwargs.get("level") is not None:
if kwargs.get("level") is not None or axis is None:
return self.default_to_pandas(pandas.DataFrame.mean, axis=axis, **kwargs)

skipna = kwargs.get("skipna", True)
Expand Down Expand Up @@ -949,10 +949,24 @@ def reduce_fn(df, **kwargs):
# Reduce operations
idxmax = Reduce.register(pandas.DataFrame.idxmax)
idxmin = Reduce.register(pandas.DataFrame.idxmin)
median = Reduce.register(pandas.DataFrame.median)

def median(self, axis, **kwargs):
if axis is None:
return self.default_to_pandas(pandas.DataFrame.median, axis=axis, **kwargs)
return Reduce.register(pandas.DataFrame.median)(self, axis=axis, **kwargs)

nunique = Reduce.register(pandas.DataFrame.nunique)
skew = Reduce.register(pandas.DataFrame.skew)
kurt = Reduce.register(pandas.DataFrame.kurt)

def skew(self, axis, **kwargs):
if axis is None:
return self.default_to_pandas(pandas.DataFrame.skew, axis=axis, **kwargs)
return Reduce.register(pandas.DataFrame.skew)(self, axis=axis, **kwargs)

def kurt(self, axis, **kwargs):
if axis is None:
return self.default_to_pandas(pandas.DataFrame.kurt, axis=axis, **kwargs)
return Reduce.register(pandas.DataFrame.kurt)(self, axis=axis, **kwargs)

sem = Reduce.register(pandas.DataFrame.sem)
std = Reduce.register(pandas.DataFrame.std)
var = Reduce.register(pandas.DataFrame.var)
Expand Down
100 changes: 58 additions & 42 deletions modin/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1779,22 +1779,7 @@ def iloc(self): # noqa: RT01, D200

@_inherit_docstrings(pandas.DataFrame.kurt, apilink="pandas.DataFrame.kurt")
def kurt(self, axis=0, skipna=True, numeric_only=False, **kwargs):
validate_bool_kwarg(skipna, "skipna", none_allowed=False)
axis = self._get_axis_number(axis)

if not numeric_only:
self._validate_dtypes(numeric_only=True)

data = self._get_numeric_data(axis) if numeric_only else self

return self._reduce_dimension(
data._query_compiler.kurt(
axis=axis,
skipna=skipna,
numeric_only=numeric_only,
**kwargs,
)
)
return self._stat_operation("kurt", axis, skipna, numeric_only, **kwargs)

kurtosis = kurt

Expand Down Expand Up @@ -1867,16 +1852,60 @@ def max(
Return the maximum of the values over the requested axis.
"""
validate_bool_kwarg(skipna, "skipna", none_allowed=False)
orig_axis = axis
axis = self._get_axis_number(axis)
data = self._validate_dtypes_min_max(axis, numeric_only)
return data._reduce_dimension(
res = data._reduce_dimension(
data._query_compiler.max(
axis=axis,
skipna=skipna,
numeric_only=numeric_only,
**kwargs,
)
)
if orig_axis is None:
res = res._reduce_dimension(
res._query_compiler.max(
axis=0,
skipna=skipna,
numeric_only=False,
**kwargs,
)
)
return res

def min(
self,
axis: Axis = 0,
skipna: bool = True,
numeric_only=False,
**kwargs,
): # noqa: PR01, RT01, D200
"""
Return the minimum of the values over the requested axis.
"""
validate_bool_kwarg(skipna, "skipna", none_allowed=False)
orig_axis = axis
axis = self._get_axis_number(axis)
data = self._validate_dtypes_min_max(axis, numeric_only)
res = data._reduce_dimension(
data._query_compiler.min(
axis=axis,
skipna=skipna,
numeric_only=numeric_only,
**kwargs,
)
)
if orig_axis is None:
res = res._reduce_dimension(
res._query_compiler.min(
axis=0,
skipna=skipna,
numeric_only=False,
**kwargs,
)
)
return res

def _stat_operation(
self,
Expand Down Expand Up @@ -1911,7 +1940,7 @@ def _stat_operation(
self is DataFrame and level is not specified.
`DataFrame` - self is DataFrame and level is specified.
"""
axis = self._get_axis_number(axis)
axis = self._get_axis_number(axis) if axis is not None else None
validate_bool_kwarg(skipna, "skipna", none_allowed=False)
if op_name == "median":
numpy_compat.function.validate_median((), kwargs)
Expand All @@ -1924,14 +1953,23 @@ def _stat_operation(
if not numeric_only:
self._validate_dtypes(numeric_only=True)

data = self._get_numeric_data(axis) if numeric_only else self
data = (
self._get_numeric_data(axis if axis is not None else 0)
if numeric_only
else self
)
result_qc = getattr(data._query_compiler, op_name)(
axis=axis,
skipna=skipna,
numeric_only=numeric_only,
**kwargs,
)
return self._reduce_dimension(result_qc)
return (
self._reduce_dimension(result_qc)
if isinstance(result_qc, type(self._query_compiler))
# scalar case
else result_qc
)

def memory_usage(self, index=True, deep=False): # noqa: PR01, RT01, D200
"""
Expand All @@ -1941,28 +1979,6 @@ def memory_usage(self, index=True, deep=False): # noqa: PR01, RT01, D200
self._query_compiler.memory_usage(index=index, deep=deep)
)

def min(
self,
axis: Axis = 0,
skipna: bool = True,
numeric_only=False,
**kwargs,
): # noqa: PR01, RT01, D200
"""
Return the minimum of the values over the requested axis.
"""
validate_bool_kwarg(skipna, "skipna", none_allowed=False)
axis = self._get_axis_number(axis)
data = self._validate_dtypes_min_max(axis, numeric_only)
return data._reduce_dimension(
data._query_compiler.min(
axis=axis,
skipna=skipna,
numeric_only=numeric_only,
**kwargs,
)
)

def mod(
self, other, axis="columns", level=None, fill_value=None
): # noqa: PR01, RT01, D200
Expand Down
15 changes: 0 additions & 15 deletions modin/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1135,21 +1135,6 @@ def keys(self): # noqa: RT01, D200
"""
return self.index

def kurt(
self,
axis: Axis = 0,
skipna=True,
numeric_only=False,
**kwargs,
): # noqa: PR01, RT01, D200
"""
Return unbiased kurtosis over requested axis.
"""
axis = self._get_axis_number(axis)
return super(Series, self).kurt(axis, skipna, numeric_only, **kwargs)

kurtosis = kurt

def le(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, D200
"""
Return less than or equal to of series and `other`, element-wise (binary operator `le`).
Expand Down
2 changes: 1 addition & 1 deletion modin/pandas/test/dataframe/test_reduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ def test_sum_single_column(data):
@pytest.mark.parametrize(
"fn", ["max", "min", "median", "mean", "skew", "kurt", "sem", "std", "var"]
)
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("axis", [0, 1, None])
@pytest.mark.parametrize(
"numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys)
)
Expand Down

0 comments on commit d192e87

Please sign in to comment.