diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5f93e08d51baa..2f23de6a45516 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1149,6 +1149,7 @@ ExtensionArray - Fixed bug that `DataFrame(columns=.., dtype='string')` would fail (:issue:`27953`, :issue:`33623`) - Bug where :class:`DataFrame` column set to scalar extension type was considered an object type rather than the extension type (:issue:`34832`) - Fixed bug in ``IntegerArray.astype`` to correctly copy the mask as well (:issue:`34931`). +- Fixed bug where DataFrame reductions with Int64 columns would cast to float64 (:issue:`32651`) Other ^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cfe5621fec14e..be60c4b504410 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8501,6 +8501,22 @@ def _count_level(self, level, axis=0, numeric_only=False): def _reduce( self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds ): + """ + Reduce DataFrame over axis with given operation. + + Parameters + ---------- + op : func + The reducing function to be called on the values. + name : str + The name of the reduction. + axis : int + numeric_only : bool, optional + filter_type : None or "bool" + Set to "bool" for ops that only work on boolean values. 
+ skipna, **kwds : keywords to pass to the `op` function + + """ assert filter_type is None or filter_type == "bool", filter_type @@ -8551,6 +8567,7 @@ def _get_data(axis_matters): raise NotImplementedError(msg) return data + # special case for block-wise if numeric_only is not None and axis in [0, 1]: df = self if numeric_only is True: @@ -8579,33 +8596,47 @@ def blk_func(values): out[:] = coerce_to_dtypes(out.values, df.dtypes) return out - if not self._is_homogeneous_type: - # try to avoid self.values call + def array_func(values): + if isinstance(values, ExtensionArray): + return values._reduce(name, skipna=skipna, **kwds) + else: + return op(values, skipna=skipna, **kwds) - if filter_type is None and axis == 0 and len(self) > 0: - # operate column-wise + # all other options with axis=0 are done column-array-wise + if axis == 0: - # numeric_only must be None here, as other cases caught above - # require len(self) > 0 bc frame_apply messes up empty prod/sum + def _constructor(df, result, index=None): + index = index if index is not None else df.columns + if len(result): + return df._constructor_sliced(result, index=index) + else: + # set correct dtype for empty result + dtype = "bool" if filter_type == "bool" else "float64" + return df._constructor_sliced(result, index=index, dtype=dtype) - # this can end up with a non-reduction - # but not always. if the types are mixed - # with datelike then need to make sure a series + df = self + if numeric_only is True: + df = _get_data(axis_matters=True) - # we only end up here if we have not specified - # numeric_only and yet we have tried a - # column-by-column reduction, where we have mixed type. 
- # So let's just do what we can - from pandas.core.apply import frame_apply + if numeric_only is not None: + result = [array_func(arr) for arr in df._iter_column_arrays()] + return _constructor(df, result) + else: + # with numeric_only=None, need to ignore exceptions per column + result = [] + indices = [] + for i, arr in enumerate(df._iter_column_arrays()): + try: + res = array_func(arr) + except Exception: + pass + else: + result.append(res) + indices.append(i) - opa = frame_apply( - self, func=f, result_type="expand", ignore_failures=True - ) - result = opa.get_result() - if result.ndim == self.ndim: - result = result.iloc[0].rename(None) - return result + return _constructor(df, result, index=df.columns[indices]) + # remaining cases for axis=1 or axis=None if numeric_only is None: data = self values = data.values diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 571fcc67f3bb5..f6badf7ec9139 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11402,7 +11402,7 @@ def stat_func( if level is not None: return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) return self._reduce( - func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only + func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only, ) return set_function_name(stat_func, name, cls) diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index 44c3077228e80..a81434339fdae 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -133,6 +133,15 @@ def test_integer_array_numpy_sum(values, expected): assert result == expected +@pytest.mark.parametrize("op", ["sum", "prod", "min", "max"]) +def test_dataframe_reductions(op): + # https://github.com/pandas-dev/pandas/pull/32867 + # ensure the integers are not cast to float during reductions + df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")}) + result = getattr(df, op)() + assert 
isinstance(result["a"], np.int64) + + # TODO(jreback) - these need testing / are broken # shift diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index db8bb5ca3c437..7c473fb9c6847 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -415,7 +415,7 @@ def test_stat_operators_attempt_obj_array(self, method): for df in [df1, df2]: assert df.values.dtype == np.object_ result = getattr(df, method)(1) - expected = getattr(df.astype("f8"), method)(1) + expected = getattr(df, method)(1) if method in ["sum", "prod"]: tm.assert_series_equal(result, expected) @@ -1303,3 +1303,26 @@ def test_preserve_timezone(self, initial: str, method): df = DataFrame([expected]) result = getattr(df, method)(axis=1) tm.assert_series_equal(result, expected) + + +def test_mixed_frame_with_integer_sum(): + # https://github.com/pandas-dev/pandas/issues/34520 + df = pd.DataFrame([["a", 1]], columns=list("ab")) + df = df.astype({"b": "Int64"}) + result = df.sum() + expected = pd.Series(["a", 1], index=["a", "b"]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("numeric_only", [True, False, None]) +@pytest.mark.parametrize("method", ["min", "max"]) +def test_minmax_extensionarray(method, numeric_only): + # https://github.com/pandas-dev/pandas/issues/32651 + int64_info = np.iinfo("int64") + ser = Series([int64_info.max, None, int64_info.min], dtype=pd.Int64Dtype()) + df = DataFrame({"Int64": ser}) + result = getattr(df, method)(numeric_only=numeric_only) + expected = Series( + [getattr(int64_info, method)], index=pd.Index(["Int64"], dtype="object") + ) + tm.assert_series_equal(result, expected)