From a9ca0fa52616b725fc5e9d2edd35bdfd095878b0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 20 Mar 2020 14:58:31 +0100 Subject: [PATCH 01/20] ENH/PERF: enable column-wise reductions for EA-backed columns --- pandas/core/frame.py | 65 +++++++++++++++++++++++++++++++ pandas/core/generic.py | 10 ++++- pandas/core/internals/managers.py | 8 ++++ pandas/core/series.py | 1 + 4 files changed, 83 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b9e43b1cd9b05..08877b6d5ef92 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7852,6 +7852,23 @@ def _count_level(self, level, axis=0, numeric_only=False): def _reduce( self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds ): + """ + Reduce DataFrame over axis with given operation. + + Parameters + ---------- + op : func + The reducing function to be called on the values. + name : str + The name of the reduction. + axis : int + numeric_only : bool, optional + filter_type : None or "bool" + Set to "bool" for ops that give boolean results. + skipna, **kwds : keywords to pass to the `op` function + + """ + column_wise = kwds.pop("column_wise", False) assert filter_type is None or filter_type == "bool", filter_type @@ -7898,6 +7915,13 @@ def _get_data(axis_matters): raise NotImplementedError(msg) return data + if axis == 0 and column_wise: + # column-wise reduction + df = self + if numeric_only is True: + df = _get_data(axis_matters=True) + return DataFrame._reduce_columns(df, op, name, skipna=skipna, **kwds) + if numeric_only is not None and axis in [0, 1]: df = self if numeric_only is True: @@ -7994,6 +8018,47 @@ def blk_func(values): result = self._constructor_sliced(result, index=labels) return result + def _reduce_columns(self, op, name, skipna=True, **kwds): + """ + Reduce DataFrame column-wise. + + Parameters + ---------- + op : func + The reducing function to be called on the values. Only used + for columns backed by a numpy ndarray. + name : str + The name of the reduction. + skipna, **kwds : keywords to pass to the `op` function + + Returns + ------- + Series + """ + result = [] + + for arr in self._iter_arrays(): + if isinstance(arr, ExtensionArray): + # dispatch to ExtensionArray interface + val = arr._reduce(name, skipna=skipna, **kwds) + else: + # dispatch to numpy arrays + with np.errstate(all="ignore"): + val = op(arr, skipna=skipna, **kwds) + + result.append(val) + + return self._constructor_sliced(result, index=self.columns) + + def _iter_arrays(self): + """ + Iterate over the arrays of all columns in order. + + This returns the values as stored in the Block (ndarray or ExtensionArray). + """ + for i in range(len(self.columns)): + yield self._data.iget_values(i) + def nunique(self, axis=0, dropna=True) -> Series: """ Count distinct observations over requested axis. 
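A minimal, standalone sketch of the dispatch rule the new column-wise path above implements, using only public API (the helper name `column_wise_sum` and its signature are illustrative, not part of the patch): ExtensionArray-backed columns are reduced through the array's own reduction machinery (`Series.sum` delegates to `ExtensionArray._reduce`), while ndarray-backed columns go through a numpy reducer, so extension dtypes are not silently cast to float64.

import numpy as np
import pandas as pd
from pandas.api.types import is_extension_array_dtype


def column_wise_sum(df: pd.DataFrame, skipna: bool = True) -> pd.Series:
    # Illustrative only: mimics the DataFrame._reduce_columns / blk_func dispatch.
    out = {}
    for col in df.columns:
        ser = df[col]
        if is_extension_array_dtype(ser.dtype):
            # EA path: Series.sum delegates to ExtensionArray._reduce
            out[col] = ser.sum(skipna=skipna)
        else:
            # ndarray path: plain numpy reduction, as op(values, ...) does
            values = ser.to_numpy()
            out[col] = np.nansum(values) if skipna else values.sum()
    return pd.Series(out)


df = pd.DataFrame({"a": pd.array([1, 2, None], dtype="Int64"), "b": [10, 20, 30]})
print(column_wise_sum(df))  # a: 3, b: 60; both columns keep integer results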
diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6b0f7de11a3e7..8d47c1c4494b7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11067,6 +11067,7 @@ def stat_func( min_count=0, **kwargs, ): + column_wise = kwargs.pop("column_wise", False) if name == "sum": nv.validate_sum(tuple(), kwargs) elif name == "prod": @@ -11088,6 +11089,7 @@ def stat_func( skipna=skipna, numeric_only=numeric_only, min_count=min_count, + column_wise=column_wise, ) return set_function_name(stat_func, name, cls) @@ -11117,6 +11119,7 @@ def _make_stat_function( def stat_func( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): + column_wise = kwargs.pop("column_wise", False) if name == "median": nv.validate_median(tuple(), kwargs) else: @@ -11128,7 +11131,12 @@ def stat_func( if level is not None: return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) return self._reduce( - func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only + func, + name=name, + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + column_wise=column_wise, ) return set_function_name(stat_func, name, cls) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 66e96af05eb71..b41e42fc820ee 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -998,6 +998,14 @@ def iget(self, i: int) -> "SingleBlockManager": fastpath=True, ) + def iget_values(self, i: int): + """ + Return the data for column i as the values (ndarray or ExtensionArray). + """ + block = self.blocks[self.blknos[i]] + values = block.iget(self.blklocs[i]) + return values + def delete(self, item): """ Delete selected item (items if non-unique) in-place. diff --git a/pandas/core/series.py b/pandas/core/series.py index aaaeadc0cf618..c5299f8ee654c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3871,6 +3871,7 @@ def _reduce( If we have an ndarray as a value, then simply perform the operation, otherwise delegate to the object. 
""" + kwds.pop("column_wise", None) delegate = self._values if axis is not None: From 21aee0d16e0ef07e91feb1b64b2d489fead9d052 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 20 Mar 2020 15:40:03 +0100 Subject: [PATCH 02/20] fix numeric_only for EAs --- pandas/core/frame.py | 2 +- pandas/core/internals/managers.py | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 08877b6d5ef92..4a0b77d523a7b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7940,7 +7940,7 @@ def blk_func(values): # After possibly _get_data and transposing, we are now in the # simple case where we can use BlockManager._reduce - res = df._data.reduce(blk_func) + res = df._data.reduce(blk_func, name, skipna, **kwds) assert isinstance(res, dict) if len(res): assert len(res) == max(list(res.keys())) + 1, res.keys() diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b41e42fc820ee..61a80d0b4a2b6 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -349,16 +349,19 @@ def _verify_integrity(self) -> None: f"tot_items: {tot_items}" ) - def reduce(self, func, *args, **kwargs): + def reduce(self, func, name, skipna=True, **kwds): # If 2D, we assume that we're operating column-wise if self.ndim == 1: # we'll be returning a scalar blk = self.blocks[0] - return func(blk.values, *args, **kwargs) + return func(blk.values) res = {} for blk in self.blocks: - bres = func(blk.values, *args, **kwargs) + if isinstance(blk, ExtensionBlock): + bres = blk.values._reduce(name, skipna=skipna, **kwds) + else: + bres = func(blk.values) if np.ndim(bres) == 0: # EA @@ -366,7 +369,7 @@ def reduce(self, func, *args, **kwargs): new_res = zip(blk.mgr_locs.as_array, [bres]) else: assert bres.ndim == 1, bres.shape - assert blk.shape[0] == len(bres), (blk.shape, bres.shape, args, kwargs) + assert blk.shape[0] == len(bres), (blk.shape, bres.shape) new_res = zip(blk.mgr_locs.as_array, bres) nr = dict(new_res) From 9f83f6e09f3295736f215261b63ee869d423ca3c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 20 Mar 2020 15:59:26 +0100 Subject: [PATCH 03/20] fix _reduce_columns call --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4a0b77d523a7b..8427bd220231a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7920,7 +7920,7 @@ def _get_data(axis_matters): df = self if numeric_only is True: df = _get_data(axis_matters=True) - return DataFrame._reduce_columns(df, op, name, skipna=skipna, **kwds) + return df._reduce_columns(op, name, skipna=skipna, **kwds) if numeric_only is not None and axis in [0, 1]: df = self From a9706e0dae4ceba83dc859e4a2d4bcffdf28edc0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 20 Mar 2020 16:30:09 +0100 Subject: [PATCH 04/20] move EA._reduce call into blk_func --- pandas/core/frame.py | 10 +++++----- pandas/core/internals/managers.py | 7 ++----- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8427bd220231a..19692a059c4c7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7933,14 +7933,14 @@ def _get_data(axis_matters): out_dtype = "bool" if filter_type == "bool" else None def blk_func(values): - if values.ndim == 1 and not isinstance(values, np.ndarray): - # we can't pass axis=1 - return op(values, axis=0, skipna=skipna, **kwds) - return op(values, 
axis=1, skipna=skipna, **kwds) + if isinstance(values, ExtensionArray): + return values._reduce(name, skipna=skipna, **kwds) + else: + return op(values, axis=1, skipna=skipna, **kwds) # After possibly _get_data and transposing, we are now in the # simple case where we can use BlockManager._reduce - res = df._data.reduce(blk_func, name, skipna, **kwds) + res = df._data.reduce(blk_func) assert isinstance(res, dict) if len(res): assert len(res) == max(list(res.keys())) + 1, res.keys() diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 61a80d0b4a2b6..b8198e49f7dc5 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -349,7 +349,7 @@ def _verify_integrity(self) -> None: f"tot_items: {tot_items}" ) - def reduce(self, func, name, skipna=True, **kwds): + def reduce(self, func): # If 2D, we assume that we're operating column-wise if self.ndim == 1: # we'll be returning a scalar @@ -358,10 +358,7 @@ def reduce(self, func, name, skipna=True, **kwds): res = {} for blk in self.blocks: - if isinstance(blk, ExtensionBlock): - bres = blk.values._reduce(name, skipna=skipna, **kwds) - else: - bres = func(blk.values) + bres = func(blk.values) if np.ndim(bres) == 0: # EA From 07372e38b63fe0c3bca6a771d82ef521833119d6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 20 Mar 2020 16:51:26 +0100 Subject: [PATCH 05/20] reuse blk_func for column-wise, inline _iter_arrays --- pandas/core/frame.py | 42 +++++++++++------------------------------- 1 file changed, 11 insertions(+), 31 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 19692a059c4c7..1bb6510603831 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7915,12 +7915,18 @@ def _get_data(axis_matters): raise NotImplementedError(msg) return data + def blk_func(values): + if isinstance(values, ExtensionArray): + return values._reduce(name, skipna=skipna, **kwds) + else: + return op(values, axis=1, skipna=skipna, **kwds) + if axis == 0 and column_wise: # column-wise reduction df = self if numeric_only is True: df = _get_data(axis_matters=True) - return df._reduce_columns(op, name, skipna=skipna, **kwds) + return df._reduce_columns(blk_func) if numeric_only is not None and axis in [0, 1]: df = self @@ -7932,12 +7938,6 @@ def _get_data(axis_matters): out_dtype = "bool" if filter_type == "bool" else None - def blk_func(values): - if isinstance(values, ExtensionArray): - return values._reduce(name, skipna=skipna, **kwds) - else: - return op(values, axis=1, skipna=skipna, **kwds) - # After possibly _get_data and transposing, we are now in the # simple case where we can use BlockManager._reduce res = df._data.reduce(blk_func) @@ -8018,18 +8018,14 @@ def blk_func(values): result = self._constructor_sliced(result, index=labels) return result - def _reduce_columns(self, op, name, skipna=True, **kwds): + def _reduce_columns(self, op): """ Reduce DataFrame column-wise. Parameters ---------- op : func - The reducing function to be called on the values. Only used - for columns backed by a numpy ndarray. - name : str - The name of the reduction. - skipna, **kwds : keywords to pass to the `op` function + The reducing function to be called on the values. 
Returns ------- @@ -8037,28 +8033,12 @@ def _reduce_columns(self, op, name, skipna=True, **kwds): """ result = [] - for arr in self._iter_arrays(): - if isinstance(arr, ExtensionArray): - # dispatch to ExtensionArray interface - val = arr._reduce(name, skipna=skipna, **kwds) - else: - # dispatch to numpy arrays - with np.errstate(all="ignore"): - val = op(arr, skipna=skipna, **kwds) - + for i in range(len(self.columns)): + val = op(self._data.iget_values(i)) result.append(val) return self._constructor_sliced(result, index=self.columns) - def _iter_arrays(self): - """ - Iterate over the arrays of all columns in order. - - This returns the values as stored in the Block (ndarray or ExtensionArray). - """ - for i in range(len(self.columns)): - yield self._data.iget_values(i) - def nunique(self, axis=0, dropna=True) -> Series: """ Count distinct observations over requested axis. From 2d084509ffdf51a92992500d21db8a939aa49680 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 20 Mar 2020 20:13:47 +0100 Subject: [PATCH 06/20] temp --- pandas/core/frame.py | 2 +- pandas/core/generic.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1bb6510603831..8d9d5dcd9672c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7864,7 +7864,7 @@ def _reduce( axis : int numeric_only : bool, optional filter_type : None or "bool" - Set to "bool" for ops that give boolean results. + Set to "bool" for ops that only work on boolean values. skipna, **kwds : keywords to pass to the `op` function """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8d47c1c4494b7..451c3954e578d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6480,7 +6480,7 @@ def replace( raise AssertionError("'to_replace' must be 'None' if 'regex' is not a bool") self._consolidate_inplace() - + breakpoint() if value is None: # passing a single value that is scalar like # when value is None (GH5319), for compat From 9e2a780a45f1a9a122a45df8e3e50ecccf9b6fea Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 27 Mar 2020 18:15:29 +0100 Subject: [PATCH 07/20] first attempts of going block-wise with numeric_only=None --- pandas/core/frame.py | 47 ++++++++++++++++++++------- pandas/core/generic.py | 1 - pandas/core/internals/managers.py | 48 +++++++++++++++++++++++++--- pandas/tests/frame/test_analytics.py | 2 +- 4 files changed, 80 insertions(+), 18 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 62ac5a51c2e06..f3ce11fde02a3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8001,29 +8001,54 @@ def blk_func(values): df = _get_data(axis_matters=True) return df._reduce_columns(blk_func) - if numeric_only is not None and axis in [0, 1]: + # if numeric_only is not None and axis in [0, 1]: + if axis in [0, 1]: df = self if numeric_only is True: df = _get_data(axis_matters=True) if axis == 1: df = df.T - axis = 0 + # axis = 0 out_dtype = "bool" if filter_type == "bool" else None # After possibly _get_data and transposing, we are now in the # simple case where we can use BlockManager._reduce - res = df._data.reduce(blk_func) + try: + res = df._data.reduce(blk_func, ignore_failures=numeric_only is None) + except TypeError: + # if block-wise fails and numeric_only was None, we try + # again after removing non-numerical columns. 
+ # (got here with mixed float + string frame and axis=1 -> need + # to remove non-numerical columns before transposing) + if numeric_only is None: + df = _get_data(axis_matters=True) + if axis == 1: + df = df.T + else: + raise + res = df._data.reduce(blk_func) + + # breakpoint() assert isinstance(res, dict) - if len(res): - assert len(res) == max(list(res.keys())) + 1, res.keys() - out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) - out.index = df.columns - if axis == 0 and df.dtypes.apply(needs_i8_conversion).any(): - # FIXME: needs_i8_conversion check is kludge, not sure - # why it is necessary in this case and this case alone - out[:] = coerce_to_dtypes(out.values, df.dtypes) + # if len(res): + # assert len(res) == max(list(res.keys())) + 1, res.keys() + + out = df._constructor_sliced( + res, index=list(res.keys()), dtype=out_dtype + ).sort_index() + if len(res) < len(df.columns): + out.index = df.columns[np.sort(list(res.keys()))] + else: + out.index = df.columns + # if axis == 0 and df.dtypes.apply(needs_i8_conversion).any(): + # # FIXME: needs_i8_conversion check is kludge, not sure + # # why it is necessary in this case and this case alone + # out[:] = coerce_to_dtypes(out.values, df.dtypes) return out + else: + # axis is None + return f(self.values) if numeric_only is None: data = self diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0424dd4e3ea07..eabe34efd0391 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6478,7 +6478,6 @@ def replace( raise AssertionError("'to_replace' must be 'None' if 'regex' is not a bool") self._consolidate_inplace() - breakpoint() if value is None: # passing a single value that is scalar like # when value is None (GH5319), for compat diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index a77842ba4b9d0..a8e97b6b42b39 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -41,6 +41,7 @@ CategoricalBlock, DatetimeTZBlock, ExtensionBlock, + ObjectBlock, ObjectValuesExtensionBlock, _extend_blocks, _merge_blocks, @@ -350,7 +351,7 @@ def _verify_integrity(self) -> None: f"tot_items: {tot_items}" ) - def reduce(self, func): + def reduce(self, func, ignore_failures=False): # If 2D, we assume that we're operating column-wise if self.ndim == 1: # we'll be returning a scalar @@ -359,16 +360,53 @@ def reduce(self, func): res = {} for blk in self.blocks: - bres = func(blk.values) + placement = blk.mgr_locs.as_array + if isinstance(blk, CategoricalBlock): + try: + bres = func(blk.values) + except TypeError: + # not all operations (eg any, all) are supported on + # Categorical, so fallback to operating on dense array + # eg pandas/tests/frame/test_analytics.py::TestDataFrameAnalytics::test_any_all_np_func + bres = func(np.asarray(blk.values).reshape(1, len(blk.values))) + elif isinstance(blk, ObjectBlock): + try: + bres = func(blk.values) + except TypeError: + # object dtype can have different type of objects in + # different columns, so for this specific case we need + # to fall back to apply the function column-wise + values = blk.values + n_cols = values.shape[0] + results = [] + locs = [] + for i in range(n_cols): + # need to keep as 2D since the func expects that + col_values = values[[i], :] + try: + col_res = func(col_values) + except TypeError: + if ignore_failures: + pass + else: + raise + else: + results.extend(col_res.tolist()) + locs.append(placement[i]) + bres = np.array(results, dtype=object) + placement = locs + + else: + 
bres = func(blk.values) if np.ndim(bres) == 0: # EA assert blk.shape[0] == 1 - new_res = zip(blk.mgr_locs.as_array, [bres]) + new_res = zip(placement, [bres]) else: assert bres.ndim == 1, bres.shape - assert blk.shape[0] == len(bres), (blk.shape, bres.shape) - new_res = zip(blk.mgr_locs.as_array, bres) + # assert blk.shape[0] == len(bres), (blk.shape, bres.shape) + new_res = zip(placement, bres) nr = dict(new_res) assert not any(key in res for key in nr) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 3a7df29ae9091..b230f4c848264 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -408,7 +408,7 @@ def test_stat_operators_attempt_obj_array(self, method): for df in [df1, df2]: assert df.values.dtype == np.object_ result = getattr(df, method)(1) - expected = getattr(df.astype("f8"), method)(1) + expected = getattr(df, method)(1) if method in ["sum", "prod"]: tm.assert_series_equal(result, expected) From 594d2b0ce7cdd5e3f0f5c57a566d233c34872315 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 3 Apr 2020 09:29:16 +0200 Subject: [PATCH 08/20] TEMP --- pandas/core/frame.py | 16 ++++++++++------ pandas/core/series.py | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 33282e8325e82..d7926dcb47491 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -102,6 +102,7 @@ ABCSeries, ) from pandas.core.dtypes.missing import isna, notna +from pandas.core.dtypes.base import ExtensionDtype from pandas.core import algorithms, common as com, nanops, ops from pandas.core.accessor import CachedAccessor @@ -7967,7 +7968,10 @@ def _reduce( constructor = self._constructor def f(x): - return op(x, axis=axis, skipna=skipna, **kwds) + if isinstance(x.dtype, ExtensionDtype): + return x._values._reduce(name, skipna=skipna, **kwds) + else: + return op(x, axis=axis, skipna=skipna, **kwds) def _get_data(axis_matters): if filter_type is None: @@ -8000,7 +8004,7 @@ def blk_func(values): return df._reduce_columns(blk_func) # if numeric_only is not None and axis in [0, 1]: - if axis in [0, 1]: + if numeric_only is not None and axis in [0, 1]: df = self if numeric_only is True: df = _get_data(axis_matters=True) @@ -8044,11 +8048,11 @@ def blk_func(values): # # why it is necessary in this case and this case alone # out[:] = coerce_to_dtypes(out.values, df.dtypes) return out - else: - # axis is None - return f(self.values) + # else: + # # axis is None + # return f(self.values) - if not self._is_homogeneous_type: + if True: #not self._is_homogeneous_type: # try to avoid self.values call if filter_type is None and axis == 0 and len(self) > 0: diff --git a/pandas/core/series.py b/pandas/core/series.py index 877242523fa93..6a491d01a10eb 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -203,7 +203,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): def __init__( self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False ): - + # breakpoint() # we are called internally, so short-circuit if fastpath: From 5b0370e0751301aa7c3175deea76dda9d8193e3a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 29 May 2020 16:14:18 +0200 Subject: [PATCH 09/20] use iter_column_arrays --- pandas/core/frame.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index dd6a8c44dcc35..aea865a6cd6bb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py 
@@ -67,6 +67,7 @@ validate_percentile, ) +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import ( cast_scalar_to_array, coerce_to_dtypes, @@ -112,7 +113,6 @@ ABCSeries, ) from pandas.core.dtypes.missing import isna, notna -from pandas.core.dtypes.base import ExtensionDtype from pandas.core import algorithms, common as com, nanops, ops from pandas.core.accessor import CachedAccessor @@ -8353,7 +8353,7 @@ def blk_func(values): # # axis is None # return f(self.values) - if True: #not self._is_homogeneous_type: + if True: # not self._is_homogeneous_type: # try to avoid self.values call if filter_type is None and axis == 0 and len(self) > 0: @@ -8443,9 +8443,8 @@ def _reduce_columns(self, op): """ result = [] - for i in range(len(self.columns)): - val = op(self._data.iget_values(i)) - result.append(val) + for arr in self._iter_column_arrays(): + result.append(op(arr)) return self._constructor_sliced(result, index=self.columns) From 7088cfcbbf37cff20fe06432dd5062bcabfe1f09 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 29 May 2020 18:00:06 +0200 Subject: [PATCH 10/20] intermediate clean-up: remove BM.reduce changes + do column-wise for axis=0 --- pandas/core/frame.py | 139 ++++++++++++++++-------------- pandas/core/internals/managers.py | 48 ++--------- 2 files changed, 78 insertions(+), 109 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 441845386cf07..bddf9f1ff3e6e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8507,7 +8507,7 @@ def _reduce( skipna, **kwds : keywords to pass to the `op` function """ - column_wise = kwds.pop("column_wise", False) + # column_wise = kwds.pop("column_wise", False) assert filter_type is None or filter_type == "bool", filter_type @@ -8561,69 +8561,96 @@ def _get_data(axis_matters): raise NotImplementedError(msg) return data - def blk_func(values): - if isinstance(values, ExtensionArray): - return values._reduce(name, skipna=skipna, **kwds) - else: - return op(values, axis=1, skipna=skipna, **kwds) - - if axis == 0 and column_wise: - # column-wise reduction - df = self - if numeric_only is True: - df = _get_data(axis_matters=True) - return df._reduce_columns(blk_func) - - # if numeric_only is not None and axis in [0, 1]: - if numeric_only is not None and axis in [0, 1]: + # special case for block-wise + if ( + not self._mgr.any_extension_types + and numeric_only is not None + and axis in [0, 1] + ): df = self if numeric_only is True: df = _get_data(axis_matters=True) if axis == 1: df = df.T - # axis = 0 + axis = 0 out_dtype = "bool" if filter_type == "bool" else None + def blk_func(values): + if isinstance(values, ExtensionArray): + return values._reduce(name, skipna=skipna, **kwds) + else: + return op(values, axis=1, skipna=skipna, **kwds) + # After possibly _get_data and transposing, we are now in the # simple case where we can use BlockManager._reduce - try: - res = df._mgr.reduce(blk_func, ignore_failures=numeric_only is None) - except TypeError: - # if block-wise fails and numeric_only was None, we try - # again after removing non-numerical columns. 
- # (got here with mixed float + string frame and axis=1 -> need - # to remove non-numerical columns before transposing) - if numeric_only is None: - df = _get_data(axis_matters=True) - if axis == 1: - df = df.T - else: - raise - res = df._mgr.reduce(blk_func) + res = df._mgr.reduce(blk_func) # breakpoint() assert isinstance(res, dict) + if len(res): + assert len(res) == max(list(res.keys())) + 1, res.keys() + out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) + out.index = df.columns + if axis == 0 and df.dtypes.apply(needs_i8_conversion).any(): + # FIXME: needs_i8_conversion check is kludge, not sure + # why it is necessary in this case and this case alone + out[:] = coerce_to_dtypes(out.values, df.dtypes) + return out + + def array_func(values): + if isinstance(values, ExtensionArray): + return values._reduce(name, skipna=skipna, **kwds) + else: + return op(values, skipna=skipna, **kwds) + + # all other options with axis=1 are done column-array-wise + if axis == 0: + # column-wise reduction - # if len(res): - # assert len(res) == max(list(res.keys())) + 1, res.keys() + def _constructor(df, result, index=None): + index = index if index is not None else df.columns + if len(result): + return df._constructor_sliced(result, index=index) + else: + return df._constructor_sliced(result, index=index, dtype="float64") + + def _reduce_columns(df, op): + result = [op(arr) for arr in df._iter_column_arrays()] + return _constructor(df, result) + + df = self + if numeric_only is True: + df = _get_data(axis_matters=True) - out = df._constructor_sliced( - res, index=list(res.keys()), dtype=out_dtype - ).sort_index() - if len(res) < len(df.columns): - out.index = df.columns[np.sort(list(res.keys()))] + if numeric_only is not None: + return _reduce_columns(df, array_func) else: - out.index = df.columns + # need to catch and ignore exceptions when numeric_ + try: + return _reduce_columns(df, array_func) + except TypeError: + # if column-wise fails and numeric_only was None, we try + # again after removing non-numerical columns. + # (got here with mixed float + string frame and axis=1 -> need + # to remove non-numerical columns before transposing) + + # df = _get_data(axis_matters=True) + # return _reduce_columns(df, array_func) + result = [] + indices = [] + for i, arr in enumerate(df._iter_column_arrays()): + try: + res = array_func(arr) + except Exception: + pass + else: + result.append(res) + indices.append(i) - # if axis == 0 and is_object_dtype(out.dtype): - # out[:] = coerce_to_dtypes(out.values, df.dtypes) - return out - # else: - # # axis is None - # return f(self.values) + return _constructor(df, result, index=df.columns[indices]) - if True: # not self._is_homogeneous_type: + if not self._is_homogeneous_type: # try to avoid self.values call if filter_type is None and axis == 0 and len(self) > 0: @@ -8698,26 +8725,6 @@ def blk_func(values): result = self._constructor_sliced(result, index=labels) return result - def _reduce_columns(self, op): - """ - Reduce DataFrame column-wise. - - Parameters - ---------- - op : func - The reducing function to be called on the values. - - Returns - ------- - Series - """ - result = [] - - for arr in self._iter_column_arrays(): - result.append(op(arr)) - - return self._constructor_sliced(result, index=self.columns) - def nunique(self, axis=0, dropna=True) -> Series: """ Count distinct observations over requested axis. 
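For the numeric_only=None fallback that this restructuring moves from BlockManager.reduce into the column loop of DataFrame._reduce, a rough public-API approximation (the helper name and the caught exception types are assumptions of this sketch; the patch itself catches Exception per column): try the reduction on every column and drop the columns where it raises, instead of pre-selecting numeric data.

import pandas as pd


def reduce_ignoring_failures(df: pd.DataFrame, name: str, skipna: bool = True) -> pd.Series:
    # Sketch of the numeric_only=None behaviour: reduce column by column,
    # silently skipping columns whose reduction raises.
    results, kept = [], []
    for col in df.columns:
        try:
            res = getattr(df[col], name)(skipna=skipna)
        except (TypeError, ValueError):
            continue  # e.g. mean() of a string column
        results.append(res)
        kept.append(col)
    if not results:
        # mirror the empty-result dtype handling added later in the series
        return pd.Series([], index=pd.Index(kept), dtype="float64")
    return pd.Series(results, index=pd.Index(kept))


mixed = pd.DataFrame({"a": ["x", "y"], "b": [1.0, 2.0]})
print(reduce_ignoring_failures(mixed, "mean"))  # only "b" survives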
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 8364ee4b9a9a2..e496694ee7899 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -41,7 +41,6 @@ CategoricalBlock, DatetimeTZBlock, ExtensionBlock, - ObjectBlock, ObjectValuesExtensionBlock, _extend_blocks, _safe_reshape, @@ -328,7 +327,7 @@ def _verify_integrity(self) -> None: f"tot_items: {tot_items}" ) - def reduce(self, func, ignore_failures=False): + def reduce(self, func): # If 2D, we assume that we're operating column-wise if self.ndim == 1: # we'll be returning a scalar @@ -337,53 +336,16 @@ def reduce(self, func, ignore_failures=False): res = {} for blk in self.blocks: - placement = blk.mgr_locs.as_array - if isinstance(blk, CategoricalBlock): - try: - bres = func(blk.values) - except TypeError: - # not all operations (eg any, all) are supported on - # Categorical, so fallback to operating on dense array - # eg pandas/tests/frame/test_analytics.py::TestDataFrameAnalytics::test_any_all_np_func - bres = func(np.asarray(blk.values).reshape(1, len(blk.values))) - elif isinstance(blk, ObjectBlock): - try: - bres = func(blk.values) - except TypeError: - # object dtype can have different type of objects in - # different columns, so for this specific case we need - # to fall back to apply the function column-wise - values = blk.values - n_cols = values.shape[0] - results = [] - locs = [] - for i in range(n_cols): - # need to keep as 2D since the func expects that - col_values = values[[i], :] - try: - col_res = func(col_values) - except TypeError: - if ignore_failures: - pass - else: - raise - else: - results.extend(col_res.tolist()) - locs.append(placement[i]) - bres = np.array(results, dtype=object) - placement = locs - - else: - bres = func(blk.values) + bres = func(blk.values) if np.ndim(bres) == 0: # EA assert blk.shape[0] == 1 - new_res = zip(placement, [bres]) + new_res = zip(blk.mgr_locs.as_array, [bres]) else: assert bres.ndim == 1, bres.shape - # assert blk.shape[0] == len(bres), (blk.shape, bres.shape) - new_res = zip(placement, bres) + assert blk.shape[0] == len(bres), (blk.shape, bres.shape) + new_res = zip(blk.mgr_locs.as_array, bres) nr = dict(new_res) assert not any(key in res for key in nr) From 925d660cb2c8d8f9eb7ccb03fd396f9412674418 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 29 May 2020 18:01:43 +0200 Subject: [PATCH 11/20] fixup --- pandas/core/frame.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bddf9f1ff3e6e..826827e029145 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8508,6 +8508,7 @@ def _reduce( """ # column_wise = kwds.pop("column_wise", False) + kwds.pop("column_wise", False) assert filter_type is None or filter_type == "bool", filter_type From 852331e167e87b4e6e603cc61fc69f36c3f5767e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 30 May 2020 09:45:05 +0200 Subject: [PATCH 12/20] fix dtype of empty result --- pandas/core/frame.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 826827e029145..98dffc630e00d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8605,16 +8605,16 @@ def array_func(values): else: return op(values, skipna=skipna, **kwds) - # all other options with axis=1 are done column-array-wise + # all other options with axis=0 are done column-array-wise if axis == 0: - # column-wise reduction def _constructor(df, result, 
index=None): index = index if index is not None else df.columns if len(result): return df._constructor_sliced(result, index=index) else: - return df._constructor_sliced(result, index=index, dtype="float64") + dtype = "bool" if filter_type == "bool" else "float64" + return df._constructor_sliced(result, index=index, dtype=dtype) def _reduce_columns(df, op): result = [op(arr) for arr in df._iter_column_arrays()] @@ -8632,9 +8632,7 @@ def _reduce_columns(df, op): return _reduce_columns(df, array_func) except TypeError: # if column-wise fails and numeric_only was None, we try - # again after removing non-numerical columns. - # (got here with mixed float + string frame and axis=1 -> need - # to remove non-numerical columns before transposing) + # again but removing those columns for which it fails # df = _get_data(axis_matters=True) # return _reduce_columns(df, array_func) From 34731f213f8ef3f8841d7b012dfd0162b1df2e0f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 6 Jun 2020 10:23:41 +0200 Subject: [PATCH 13/20] clean-up --- pandas/core/frame.py | 19 ++++--------------- pandas/core/generic.py | 10 +--------- pandas/core/series.py | 1 - 3 files changed, 5 insertions(+), 25 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 932dcb9b12469..4da72782ec02a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -72,7 +72,6 @@ validate_percentile, ) -from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import ( cast_scalar_to_array, coerce_to_dtypes, @@ -8498,8 +8497,6 @@ def _reduce( skipna, **kwds : keywords to pass to the `op` function """ - # column_wise = kwds.pop("column_wise", False) - kwds.pop("column_wise", False) assert filter_type is None or filter_type == "bool", filter_type @@ -8531,10 +8528,7 @@ def _reduce( constructor = self._constructor def f(x): - if isinstance(x.dtype, ExtensionDtype): - return x._values._reduce(name, skipna=skipna, **kwds) - else: - return op(x, axis=axis, skipna=skipna, **kwds) + return op(x, axis=axis, skipna=skipna, **kwds) def _get_data(axis_matters): if filter_type is None: @@ -8578,15 +8572,12 @@ def blk_func(values): # simple case where we can use BlockManager._reduce res = df._mgr.reduce(blk_func) - # breakpoint() assert isinstance(res, dict) if len(res): assert len(res) == max(list(res.keys())) + 1, res.keys() out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) out.index = df.columns - if axis == 0 and df.dtypes.apply(needs_i8_conversion).any(): - # FIXME: needs_i8_conversion check is kludge, not sure - # why it is necessary in this case and this case alone + if axis == 0 and is_object_dtype(out.dtype): out[:] = coerce_to_dtypes(out.values, df.dtypes) return out @@ -8604,6 +8595,7 @@ def _constructor(df, result, index=None): if len(result): return df._constructor_sliced(result, index=index) else: + # set correct dtype for empty result dtype = "bool" if filter_type == "bool" else "float64" return df._constructor_sliced(result, index=index, dtype=dtype) @@ -8618,15 +8610,12 @@ def _reduce_columns(df, op): if numeric_only is not None: return _reduce_columns(df, array_func) else: - # need to catch and ignore exceptions when numeric_ + # need to catch and ignore exceptions when numeric_only=None try: return _reduce_columns(df, array_func) except TypeError: # if column-wise fails and numeric_only was None, we try # again but removing those columns for which it fails - - # df = _get_data(axis_matters=True) - # return _reduce_columns(df, array_func) result = [] 
indices = [] for i, arr in enumerate(df._iter_column_arrays()): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c7c4f9e0fb5b9..41f828bb84705 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11336,7 +11336,6 @@ def stat_func( min_count=0, **kwargs, ): - column_wise = kwargs.pop("column_wise", False) if name == "sum": nv.validate_sum(tuple(), kwargs) elif name == "prod": @@ -11358,7 +11357,6 @@ def stat_func( skipna=skipna, numeric_only=numeric_only, min_count=min_count, - column_wise=column_wise, ) return set_function_name(stat_func, name, cls) @@ -11388,7 +11386,6 @@ def _make_stat_function( def stat_func( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): - column_wise = kwargs.pop("column_wise", False) if name == "median": nv.validate_median(tuple(), kwargs) else: @@ -11400,12 +11397,7 @@ def stat_func( if level is not None: return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) return self._reduce( - func, - name=name, - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - column_wise=column_wise, + func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only, ) return set_function_name(stat_func, name, cls) diff --git a/pandas/core/series.py b/pandas/core/series.py index 4f8d23cb4afd2..6b5ed86027806 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4135,7 +4135,6 @@ def _reduce( If we have an ndarray as a value, then simply perform the operation, otherwise delegate to the object. """ - kwds.pop("column_wise", None) delegate = self._values if axis is not None: From a8e61d01d97562d4fb51415fc8a3f56990b10561 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 6 Jun 2020 10:27:04 +0200 Subject: [PATCH 14/20] whitespace --- pandas/core/frame.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4da72782ec02a..55da3b7833157 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8571,7 +8571,6 @@ def blk_func(values): # After possibly _get_data and transposing, we are now in the # simple case where we can use BlockManager._reduce res = df._mgr.reduce(blk_func) - assert isinstance(res, dict) if len(res): assert len(res) == max(list(res.keys())) + 1, res.keys() From 15ec9b6333591210d1857f18e343e8efd35755ad Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 12 Jul 2020 21:35:44 +0200 Subject: [PATCH 15/20] add test case for GH34520, copied from GH35112 Co-authored-by: Simon Hawkins --- pandas/tests/frame/test_analytics.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 7e67fc40a6d90..cde9420cf3066 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1303,3 +1303,12 @@ def test_preserve_timezone(self, initial: str, method): df = DataFrame([expected]) result = getattr(df, method)(axis=1) tm.assert_series_equal(result, expected) + + +def test_mixed_frame_with_integer_sum(): + # https://github.com/pandas-dev/pandas/issues/34520 + df = pd.DataFrame([["a", 1]], columns=list("ab")) + df = df.astype({"b": "Int64"}) + result = df.sum() + expected = pd.Series(["a", 1], index=["a", "b"]) + tm.assert_series_equal(result, expected) From 2653d0298922a6f93aa46d84390b697e111ca484 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 12 Jul 2020 21:46:38 +0200 Subject: [PATCH 16/20] add test to ensure EA op is used for integer array --- pandas/tests/arrays/integer/test_function.py 
| 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index 44c3077228e80..a81434339fdae 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -133,6 +133,15 @@ def test_integer_array_numpy_sum(values, expected): assert result == expected +@pytest.mark.parametrize("op", ["sum", "prod", "min", "max"]) +def test_dataframe_reductions(op): + # https://github.com/pandas-dev/pandas/pull/32867 + # ensure the integers are not cast to float during reductions + df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")}) + result = df.max() + assert isinstance(result["a"], np.int64) + + # TODO(jreback) - these need testing / are broken # shift From 64e0069ea9052415d03523dda25eb4cc1123834b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 12 Jul 2020 21:59:41 +0200 Subject: [PATCH 17/20] remove try except --- pandas/core/frame.py | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 08e5b5115c5de..07b567651dcda 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8618,35 +8618,27 @@ def _constructor(df, result, index=None): dtype = "bool" if filter_type == "bool" else "float64" return df._constructor_sliced(result, index=index, dtype=dtype) - def _reduce_columns(df, op): - result = [op(arr) for arr in df._iter_column_arrays()] - return _constructor(df, result) - df = self if numeric_only is True: df = _get_data(axis_matters=True) if numeric_only is not None: - return _reduce_columns(df, array_func) + result = [op(arr) for arr in df._iter_column_arrays()] + return _constructor(df, result) else: - # need to catch and ignore exceptions when numeric_only=None - try: - return _reduce_columns(df, array_func) - except TypeError: - # if column-wise fails and numeric_only was None, we try - # again but removing those columns for which it fails - result = [] - indices = [] - for i, arr in enumerate(df._iter_column_arrays()): - try: - res = array_func(arr) - except Exception: - pass - else: - result.append(res) - indices.append(i) + # with numeric_only=None, need to ignore exceptions per column + result = [] + indices = [] + for i, arr in enumerate(df._iter_column_arrays()): + try: + res = array_func(arr) + except Exception: + pass + else: + result.append(res) + indices.append(i) - return _constructor(df, result, index=df.columns[indices]) + return _constructor(df, result, index=df.columns[indices]) if not self._is_homogeneous_type: # try to avoid self.values call From bb0a47bae9c709841714484b0fadf42cd85d7833 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 12 Jul 2020 22:21:51 +0200 Subject: [PATCH 18/20] remove unused code --- pandas/core/frame.py | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 07b567651dcda..f92e736fc50d1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8640,33 +8640,7 @@ def _constructor(df, result, index=None): return _constructor(df, result, index=df.columns[indices]) - if not self._is_homogeneous_type: - # try to avoid self.values call - - if filter_type is None and axis == 0 and len(self) > 0: - # operate column-wise - - # numeric_only must be None here, as other cases caught above - # require len(self) > 0 bc frame_apply messes up empty prod/sum - - # this can end up with a 
non-reduction - # but not always. if the types are mixed - # with datelike then need to make sure a series - - # we only end up here if we have not specified - # numeric_only and yet we have tried a - # column-by-column reduction, where we have mixed type. - # So let's just do what we can - from pandas.core.apply import frame_apply - - opa = frame_apply( - self, func=f, result_type="expand", ignore_failures=True - ) - result = opa.get_result() - if result.ndim == self.ndim: - result = result.iloc[0].rename(None) - return result - + # remaining cases for axis=1 or axis=None if numeric_only is None: data = self values = data.values From 9323f0e74be37cf9239673b2810c2750419051e5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 12 Jul 2020 22:32:12 +0200 Subject: [PATCH 19/20] add test for GH32651, copied from GH34210 Co-authored-by: Simon Hawkins --- pandas/core/frame.py | 2 +- pandas/tests/frame/test_analytics.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f92e736fc50d1..f8ac2ac65ee55 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8623,7 +8623,7 @@ def _constructor(df, result, index=None): df = _get_data(axis_matters=True) if numeric_only is not None: - result = [op(arr) for arr in df._iter_column_arrays()] + result = [array_func(arr) for arr in df._iter_column_arrays()] return _constructor(df, result) else: # with numeric_only=None, need to ignore exceptions per column diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index cde9420cf3066..7c473fb9c6847 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1312,3 +1312,17 @@ def test_mixed_frame_with_integer_sum(): result = df.sum() expected = pd.Series(["a", 1], index=["a", "b"]) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("numeric_only", [True, False, None]) +@pytest.mark.parametrize("method", ["min", "max"]) +def test_minmax_extensionarray(method, numeric_only): + # https://github.com/pandas-dev/pandas/issues/32651 + int64_info = np.iinfo("int64") + ser = Series([int64_info.max, None, int64_info.min], dtype=pd.Int64Dtype()) + df = DataFrame({"Int64": ser}) + result = getattr(df, method)(numeric_only=numeric_only) + expected = Series( + [getattr(int64_info, method)], index=pd.Index(["Int64"], dtype="object") + ) + tm.assert_series_equal(result, expected) From eb33f8630bb476c446cdc8e8c2e85f3ccc516b9f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 12 Jul 2020 22:35:34 +0200 Subject: [PATCH 20/20] remove check for EAs for block-wise path --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/frame.py | 6 +----- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5f93e08d51baa..2f23de6a45516 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1149,6 +1149,7 @@ ExtensionArray - Fixed bug that `DataFrame(columns=.., dtype='string')` would fail (:issue:`27953`, :issue:`33623`) - Bug where :class:`DataFrame` column set to scalar extension type was considered an object type rather than the extension type (:issue:`34832`) - Fixed bug in ``IntegerArray.astype`` to correctly copy the mask as well (:issue:`34931`). 
+- Fixed bug where DataFrame reductions with Int64 columns cast the result to float64 (:issue:`32651`)
 
 Other
 ^^^^^
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index f8ac2ac65ee55..be60c4b504410 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -8568,11 +8568,7 @@ def _get_data(axis_matters):
             return data
 
         # special case for block-wise
-        if (
-            not self._mgr.any_extension_types
-            and numeric_only is not None
-            and axis in [0, 1]
-        ):
+        if numeric_only is not None and axis in [0, 1]:
             df = self
             if numeric_only is True:
                 df = _get_data(axis_matters=True)
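A usage-level sketch of the behaviour this series targets, mirroring the tests added above (test_dataframe_reductions, test_mixed_frame_with_integer_sum, test_minmax_extensionarray); the outputs noted in the comments are the expected results once the patches are applied.

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")})
print(type(df.max()["a"]))  # expected: <class 'numpy.int64'>, not cast to float

mixed = pd.DataFrame([["a", 1]], columns=list("ab")).astype({"b": "Int64"})
print(mixed.sum())  # expected: a -> "a", b -> 1, with "b" not cast to float64

int64_info = np.iinfo("int64")
ser = pd.Series([int64_info.max, None, int64_info.min], dtype="Int64")
print(pd.DataFrame({"Int64": ser}).min())  # expected: the exact int64 minimum, no float rounding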