diff --git a/asv_bench/benchmarks/benchmarks.py b/asv_bench/benchmarks/benchmarks.py
index 1a19bdad137..e1026427eeb 100644
--- a/asv_bench/benchmarks/benchmarks.py
+++ b/asv_bench/benchmarks/benchmarks.py
@@ -260,3 +260,6 @@ def time_nunique(self, impl, data_type, data_size, axis):
 
     def time_apply(self, impl, data_type, data_size, axis):
         self.df.apply(lambda df: df.sum(), axis=axis)
+
+    def time_mean(self, impl, data_type, data_size, axis):
+        self.df.mean(axis=axis)
diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py
index f1345818218..cbd9418f17b 100644
--- a/modin/backends/pandas/query_compiler.py
+++ b/modin/backends/pandas/query_compiler.py
@@ -677,33 +677,31 @@ def mean(self, axis, **kwargs):
 
         skipna = kwargs.get("skipna", True)
 
-        def map_apply_fn(ser, **kwargs):
-            try:
-                sum_result = ser.sum(skipna=skipna)
-                count_result = ser.count()
-            except TypeError:
-                return None
-            else:
-                return (sum_result, count_result)
-
-        def reduce_apply_fn(ser, **kwargs):
-            sum_result = ser.apply(lambda x: x[0]).sum(skipna=skipna)
-            count_result = ser.apply(lambda x: x[1]).sum(skipna=skipna)
-            return sum_result / count_result
+        # TODO-FIX: this function may work incorrectly with user-defined "numeric" values.
+        # Since `count(numeric_only=True)` discards all unknown "numeric" types, we can get incorrect
+        # divisor inside the reduce function.
+        def map_fn(df, **kwargs):
+            result = pandas.DataFrame(
+                {
+                    "sum": df.sum(axis=axis, skipna=skipna),
+                    "count": df.count(axis=axis, numeric_only=True),
+                }
+            )
+            return result if axis else result.T
 
         def reduce_fn(df, **kwargs):
-            df.dropna(axis=1, inplace=True, how="any")
-            return build_applyier(reduce_apply_fn, axis=axis)(df)
-
-        def build_applyier(func, **applyier_kwargs):
-            def applyier(df, **kwargs):
-                result = df.apply(func, **applyier_kwargs)
-                return result.set_axis(df.axes[axis ^ 1], axis=0)
+            sum_cols = df["sum"] if axis else df.loc["sum"]
+            count_cols = df["count"] if axis else df.loc["count"]
 
-            return applyier
+            if not isinstance(sum_cols, pandas.Series):
+                # If we got `NaN` as the result of the sum in any axis partition,
+                # then we must consider the whole sum as `NaN`, so setting `skipna=False`
+                sum_cols = sum_cols.sum(axis=axis, skipna=False)
+                count_cols = count_cols.sum(axis=axis, skipna=False)
+            return sum_cols / count_cols
 
         return MapReduceFunction.register(
-            build_applyier(map_apply_fn, axis=axis, result_type="reduce"),
+            map_fn,
             reduce_fn,
             preserve_index=(kwargs.get("numeric_only") is not None),
         )(self, axis=axis, **kwargs)
diff --git a/modin/pandas/test/dataframe/test_reduction.py b/modin/pandas/test/dataframe/test_reduction.py
index 83ada627687..dae74e07b52 100644
--- a/modin/pandas/test/dataframe/test_reduction.py
+++ b/modin/pandas/test/dataframe/test_reduction.py
@@ -363,8 +363,6 @@ def test_sum_single_column(data):
     "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys)
 )
 def test_reduction_specific(fn, numeric_only, axis):
-    if fn == "mean" and axis == 1:
-        pytest.skip("See issue #2313 for details")
     eval_general(
         *create_test_dfs(test_data_diff_dtype),
         lambda df: getattr(df, fn)(numeric_only=numeric_only, axis=axis),