From 4c5eddd63e94bacddb96bf61f81a6a8fcd9c33f0 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 20 Aug 2020 21:19:10 -0700 Subject: [PATCH 01/15] REF: remove unnecesary try/except --- pandas/core/groupby/generic.py | 69 ++++++++++++++++------------------ 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 166631e69f523..51532a75d2d4a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -31,7 +31,7 @@ import numpy as np from pandas._libs import lib -from pandas._typing import FrameOrSeries, FrameOrSeriesUnion +from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.cast import ( @@ -60,6 +60,7 @@ validate_func_kwargs, ) import pandas.core.algorithms as algorithms +from pandas.core.arrays import ExtensionArray from pandas.core.base import DataError, SpecificationError import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype @@ -1034,32 +1035,31 @@ def _cython_agg_blocks( no_result = object() - def cast_result_block(result, block: "Block", how: str) -> "Block": - # see if we can cast the block to the desired dtype + def cast_agg_result(result, values: ArrayLike, how: str) -> ArrayLike: + # see if we can cast the values to the desired dtype # this may not be the original dtype assert not isinstance(result, DataFrame) assert result is not no_result - dtype = maybe_cast_result_dtype(block.dtype, how) + dtype = maybe_cast_result_dtype(values.dtype, how) result = maybe_downcast_numeric(result, dtype) - if block.is_extension and isinstance(result, np.ndarray): - # e.g. block.values was an IntegerArray - # (1, N) case can occur if block.values was Categorical + if isinstance(values, ExtensionArray) and isinstance(result, np.ndarray): + # e.g. 
values was an IntegerArray + # (1, N) case can occur if values was Categorical # and result is ndarray[object] # TODO(EA2D): special casing not needed with 2D EAs assert result.ndim == 1 or result.shape[0] == 1 try: # Cast back if feasible - result = type(block.values)._from_sequence( - result.ravel(), dtype=block.values.dtype + result = type(values)._from_sequence( + result.ravel(), dtype=values.dtype ) except (ValueError, TypeError): # reshape to be valid for non-Extension Block result = result.reshape(1, -1) - agg_block: "Block" = block.make_block(result) - return agg_block + return result def blk_func(block: "Block") -> List["Block"]: new_blocks: List["Block"] = [] @@ -1093,33 +1093,30 @@ def blk_func(block: "Block") -> List["Block"]: # Categoricals. This will done by later self._reindex_output() # Doing it here creates an error. See GH#34951 sgb = get_groupby(obj, self.grouper, observed=True) - try: - result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) - except TypeError: - # we may have an exception in trying to aggregate - # continue and exclude the block - raise + result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) + + result = cast(DataFrame, result) + # unwrap DataFrame to get array + if len(result._mgr.blocks) != 1: + # We've split an object block! Everything we've assumed + # about a single block input returning a single block output + # is a lie. To keep the code-path for the typical non-split case + # clean, we choose to clean up this mess later on. + assert len(locs) == result.shape[1] + for i, loc in enumerate(locs): + agg_block = result.iloc[:, [i]]._mgr.blocks[0] + agg_block.mgr_locs = [loc] + new_blocks.append(agg_block) else: - result = cast(DataFrame, result) - # unwrap DataFrame to get array - if len(result._mgr.blocks) != 1: - # We've split an object block! Everything we've assumed - # about a single block input returning a single block output - # is a lie. 
To keep the code-path for the typical non-split case - # clean, we choose to clean up this mess later on. - assert len(locs) == result.shape[1] - for i, loc in enumerate(locs): - agg_block = result.iloc[:, [i]]._mgr.blocks[0] - agg_block.mgr_locs = [loc] - new_blocks.append(agg_block) - else: - result = result._mgr.blocks[0].values - if isinstance(result, np.ndarray) and result.ndim == 1: - result = result.reshape(1, -1) - agg_block = cast_result_block(result, block, how) - new_blocks = [agg_block] + result = result._mgr.blocks[0].values + if isinstance(result, np.ndarray) and result.ndim == 1: + result = result.reshape(1, -1) + res_values = cast_agg_result(result, block.values, how) + agg_block = block.make_block(res_values) + new_blocks = [agg_block] else: - agg_block = cast_result_block(result, block, how) + res_values = cast_agg_result(result, block.values, how) + agg_block = block.make_block(res_values) new_blocks = [agg_block] return new_blocks From 42649fbb855a895ee5818d7dc80bdbd0ce0e9f5a Mon Sep 17 00:00:00 2001 From: Karthik Mathur <22126205+mathurk1@users.noreply.github.com> Date: Fri, 21 Aug 2020 17:34:51 -0500 Subject: [PATCH 02/15] TST: add test for agg on ordered categorical cols (#35630) --- .../tests/groupby/aggregate/test_aggregate.py | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index ce9d4b892d775..8fe450fe6abfc 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1063,6 +1063,85 @@ def test_groupby_get_by_index(): pd.testing.assert_frame_equal(res, expected) +@pytest.mark.parametrize( + "grp_col_dict, exp_data", + [ + ({"nr": "min", "cat_ord": "min"}, {"nr": [1, 5], "cat_ord": ["a", "c"]}), + ({"cat_ord": "min"}, {"cat_ord": ["a", "c"]}), + ({"nr": "min"}, {"nr": [1, 5]}), + ], +) +def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data): + # test 
single aggregations on ordered categorical cols GH27800 + + # create the result dataframe + input_df = pd.DataFrame( + { + "nr": [1, 2, 3, 4, 5, 6, 7, 8], + "cat_ord": list("aabbccdd"), + "cat": list("aaaabbbb"), + } + ) + + input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) + input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() + result_df = input_df.groupby("cat").agg(grp_col_dict) + + # create expected dataframe + cat_index = pd.CategoricalIndex( + ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" + ) + + expected_df = pd.DataFrame(data=exp_data, index=cat_index) + + tm.assert_frame_equal(result_df, expected_df) + + +@pytest.mark.parametrize( + "grp_col_dict, exp_data", + [ + ({"nr": ["min", "max"], "cat_ord": "min"}, [(1, 4, "a"), (5, 8, "c")]), + ({"nr": "min", "cat_ord": ["min", "max"]}, [(1, "a", "b"), (5, "c", "d")]), + ({"cat_ord": ["min", "max"]}, [("a", "b"), ("c", "d")]), + ], +) +def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data): + # test combined aggregations on ordered categorical cols GH27800 + + # create the result dataframe + input_df = pd.DataFrame( + { + "nr": [1, 2, 3, 4, 5, 6, 7, 8], + "cat_ord": list("aabbccdd"), + "cat": list("aaaabbbb"), + } + ) + + input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) + input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() + result_df = input_df.groupby("cat").agg(grp_col_dict) + + # create expected dataframe + cat_index = pd.CategoricalIndex( + ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" + ) + + # unpack the grp_col_dict to create the multi-index tuple + # this tuple will be used to create the expected dataframe index + multi_index_list = [] + for k, v in grp_col_dict.items(): + if isinstance(v, list): + for value in v: + multi_index_list.append([k, value]) + else: + multi_index_list.append([k, v]) + multi_index = pd.MultiIndex.from_tuples(tuple(multi_index_list)) + + expected_df 
= pd.DataFrame(data=exp_data, columns=multi_index, index=cat_index) + + tm.assert_frame_equal(result_df, expected_df) + + def test_nonagg_agg(): # GH 35490 - Single/Multiple agg of non-agg function give same results # TODO: agg should raise for functions that don't aggregate From 47121ddc1c655f428c6c3fcea8fbf02eba85600a Mon Sep 17 00:00:00 2001 From: tkmz-n <60312218+tkmz-n@users.noreply.github.com> Date: Sat, 22 Aug 2020 07:42:50 +0900 Subject: [PATCH 03/15] TST: resample does not yield empty groups (#10603) (#35799) --- pandas/tests/resample/test_timedelta.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index 0fbb60c176b30..3fa85e62d028c 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -150,3 +150,18 @@ def test_resample_timedelta_edge_case(start, end, freq, resample_freq): tm.assert_index_equal(result.index, expected_index) assert result.index.freq == expected_index.freq assert not np.isnan(result[-1]) + + +def test_resample_with_timedelta_yields_no_empty_groups(): + # GH 10603 + df = pd.DataFrame( + np.random.normal(size=(10000, 4)), + index=pd.timedelta_range(start="0s", periods=10000, freq="3906250n"), + ) + result = df.loc["1s":, :].resample("3s").apply(lambda x: len(x)) + + expected = pd.DataFrame( + [[768.0] * 4] * 12 + [[528.0] * 4], + index=pd.timedelta_range(start="1s", periods=13, freq="3s"), + ) + tm.assert_frame_equal(result, expected) From 1decb3e0ee1923a29b8eded7507bcb783b3870d0 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 21 Aug 2020 18:48:02 -0700 Subject: [PATCH 04/15] revert accidental rebase --- pandas/core/groupby/generic.py | 61 ++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 4b1f6cfe0a662..60e23b14eaf09 100644 --- a/pandas/core/groupby/generic.py +++ 
b/pandas/core/groupby/generic.py @@ -30,7 +30,7 @@ import numpy as np from pandas._libs import lib -from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion +from pandas._typing import FrameOrSeries, FrameOrSeriesUnion from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.cast import ( @@ -59,7 +59,6 @@ validate_func_kwargs, ) import pandas.core.algorithms as algorithms -from pandas.core.arrays import ExtensionArray from pandas.core.base import DataError, SpecificationError import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype @@ -1034,31 +1033,32 @@ def _cython_agg_blocks( no_result = object() - def cast_agg_result(result, values: ArrayLike, how: str) -> ArrayLike: - # see if we can cast the values to the desired dtype + def cast_result_block(result, block: "Block", how: str) -> "Block": + # see if we can cast the block to the desired dtype # this may not be the original dtype assert not isinstance(result, DataFrame) assert result is not no_result - dtype = maybe_cast_result_dtype(values.dtype, how) + dtype = maybe_cast_result_dtype(block.dtype, how) result = maybe_downcast_numeric(result, dtype) - if isinstance(values, ExtensionArray) and isinstance(result, np.ndarray): - # e.g. values was an IntegerArray - # (1, N) case can occur if values was Categorical + if block.is_extension and isinstance(result, np.ndarray): + # e.g. 
block.values was an IntegerArray + # (1, N) case can occur if block.values was Categorical # and result is ndarray[object] # TODO(EA2D): special casing not needed with 2D EAs assert result.ndim == 1 or result.shape[0] == 1 try: # Cast back if feasible - result = type(values)._from_sequence( - result.ravel(), dtype=values.dtype + result = type(block.values)._from_sequence( + result.ravel(), dtype=block.values.dtype ) except (ValueError, TypeError): # reshape to be valid for non-Extension Block result = result.reshape(1, -1) - return result + agg_block: "Block" = block.make_block(result) + return agg_block def blk_func(block: "Block") -> List["Block"]: new_blocks: List["Block"] = [] @@ -1092,25 +1092,28 @@ def blk_func(block: "Block") -> List["Block"]: # Categoricals. This will done by later self._reindex_output() # Doing it here creates an error. See GH#34951 sgb = get_groupby(obj, self.grouper, observed=True) - result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) - - assert isinstance(result, (Series, DataFrame)) # for mypy - # In the case of object dtype block, it may have been split - # in the operation. We un-split here. - result = result._consolidate() - assert isinstance(result, (Series, DataFrame)) # for mypy - assert len(result._mgr.blocks) == 1 - - # unwrap DataFrame to get array - result = result._mgr.blocks[0].values - if isinstance(result, np.ndarray) and result.ndim == 1: - result = result.reshape(1, -1) - res_values = cast_agg_result(result, block.values, how) - agg_block = block.make_block(res_values) - new_blocks = [agg_block] + try: + result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) + except TypeError: + # we may have an exception in trying to aggregate + # continue and exclude the block + raise + else: + assert isinstance(result, (Series, DataFrame)) # for mypy + # In the case of object dtype block, it may have been split + # in the operation. We un-split here. 
+ result = result._consolidate() + assert isinstance(result, (Series, DataFrame)) # for mypy + assert len(result._mgr.blocks) == 1 + + # unwrap DataFrame to get array + result = result._mgr.blocks[0].values + if isinstance(result, np.ndarray) and result.ndim == 1: + result = result.reshape(1, -1) + agg_block = cast_result_block(result, block, how) + new_blocks = [agg_block] else: - res_values = cast_agg_result(result, block.values, how) - agg_block = block.make_block(res_values) + agg_block = cast_result_block(result, block, how) new_blocks = [agg_block] return new_blocks From 5281ce77b0229de68d05dd3e24054b3e6f9206b0 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 23 Aug 2020 19:54:14 -0700 Subject: [PATCH 05/15] REF: implement Block.reduce --- pandas/core/frame.py | 11 +++++------ pandas/core/internals/blocks.py | 15 +++++++++++++++ pandas/core/internals/managers.py | 29 ++++++++--------------------- 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 837bd35414773..148c0ed59a80c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8647,13 +8647,12 @@ def blk_func(values): return op(values, axis=1, skipna=skipna, **kwds) # After possibly _get_data and transposing, we are now in the - # simple case where we can use BlockManager._reduce + # simple case where we can use BlockManager.reduce res = df._mgr.reduce(blk_func) - assert isinstance(res, dict) - if len(res): - assert len(res) == max(list(res.keys())) + 1, res.keys() - out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) - out.index = df.columns + out = df._constructor(res,).iloc[0].rename(None) + if out_dtype is not None: + # only astype if result is empty + out = out.astype(out_dtype) if axis == 0 and is_object_dtype(out.dtype): out[:] = coerce_to_dtypes(out.values, df.dtypes) return out diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f3286b3c20965..c62be4f767f00 100644 --- 
a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -346,6 +346,21 @@ def apply(self, func, **kwargs) -> List["Block"]: return self._split_op_result(result) + def reduce(self, func) -> List["Block"]: + # We will apply the function and reshape the result into a single-row + # Block with the same mgr_locs; squeezing will be done at a higher level + assert self.ndim == 2 + + result = func(self.values) + if np.ndim(result) == 0: + # TODO(EA2D): special case not needed with 2D EAs + res_values = np.array([[result]]) + else: + res_values = result.reshape(-1, 1) + + nb = self.make_block(res_values) + return [nb] + def _split_op_result(self, result) -> List["Block"]: # See also: split_and_operate if is_extension_array_dtype(result) and result.ndim > 1: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index f05d4cf1c4be6..297ad3077ef1d 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -330,31 +330,18 @@ def _verify_integrity(self) -> None: f"tot_items: {tot_items}" ) - def reduce(self, func): + def reduce(self: T, func) -> T: # If 2D, we assume that we're operating column-wise - if self.ndim == 1: - # we'll be returning a scalar - blk = self.blocks[0] - return func(blk.values) + assert self.ndim == 2 - res = {} + res_blocks = [] for blk in self.blocks: - bres = func(blk.values) - - if np.ndim(bres) == 0: - # EA - assert blk.shape[0] == 1 - new_res = zip(blk.mgr_locs.as_array, [bres]) - else: - assert bres.ndim == 1, bres.shape - assert blk.shape[0] == len(bres), (blk.shape, bres.shape) - new_res = zip(blk.mgr_locs.as_array, bres) - - nr = dict(new_res) - assert not any(key in res for key in nr) - res.update(nr) + nbs = blk.reduce(func) + res_blocks.extend(nbs) - return res + index = Index([0]) # placeholder + new_mgr = BlockManager.from_blocks(res_blocks, [self.items, index]) + return new_mgr def operate_blockwise(self, other: "BlockManager", array_op) -> "BlockManager": """ 
From cdcc1a0f0e4f40e02a01c0fd71e8905bf1f07364 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 23 Aug 2020 19:56:54 -0700 Subject: [PATCH 06/15] remove outdated comment --- pandas/core/frame.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 148c0ed59a80c..606bd4cc3b52d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8651,7 +8651,6 @@ def blk_func(values): res = df._mgr.reduce(blk_func) out = df._constructor(res,).iloc[0].rename(None) if out_dtype is not None: - # only astype if result is empty out = out.astype(out_dtype) if axis == 0 and is_object_dtype(out.dtype): out[:] = coerce_to_dtypes(out.values, df.dtypes) From 356cdf7a61ded4a5f0c513c27094a7e9e594f1b4 Mon Sep 17 00:00:00 2001 From: guru kiran <47276342+gurukiran07@users.noreply.github.com> Date: Mon, 24 Aug 2020 21:06:17 +0530 Subject: [PATCH 07/15] DOC: Updated aggregate docstring (#35042) * DOC: Updated aggregate docstring * Doc: updated aggregate docstring * Update pandas/core/generic.py Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> * Update generic.py * Update generic.py * Revert "Update generic.py" This reverts commit 15ecaf724e98c5bcb2d459e720ca60e22e758346. * Revert "Revert "Update generic.py"" This reverts commit cc231c80a7a0bf9a18f3d4342e156d9d95c5ac8b. * Updated docstring of agg * Trailing whitespace removed * DOC: Updated docstring of agg * Update generic.py * Updated Docstring Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- pandas/core/generic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fe412bc0ce937..9f36405bf6428 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5169,6 +5169,9 @@ def pipe(self, func, *args, **kwargs): ----- `agg` is an alias for `aggregate`. Use the alias. 
+ In pandas, agg, as most operations just ignores the missing values, + and returns the operation only considering the values that are present. + A passed user-defined-function will be passed a Series for evaluation. {examples}""" ) From e29283b04f680e325fe2a577f44bf2db8750076f Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 24 Aug 2020 14:01:06 -0700 Subject: [PATCH 08/15] REF: BlockManager.reduce with ignore_failures --- pandas/core/frame.py | 32 +++++++++++++++++++++++++++++-- pandas/core/internals/managers.py | 31 ++++++++++++++++++++++++------ 2 files changed, 55 insertions(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 606bd4cc3b52d..43610a57dce50 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8598,6 +8598,8 @@ def _reduce( cols = self.columns[~dtype_is_dt] self = self[cols] + any_object = self.dtypes.apply(is_object_dtype).any() + if axis is None and filter_type == "bool": labels = None constructor = None @@ -8648,14 +8650,40 @@ def blk_func(values): # After possibly _get_data and transposing, we are now in the # simple case where we can use BlockManager.reduce - res = df._mgr.reduce(blk_func) - out = df._constructor(res,).iloc[0].rename(None) + res, _ = df._mgr.reduce(blk_func) + out = df._constructor(res).iloc[0].rename(None) if out_dtype is not None: out = out.astype(out_dtype) if axis == 0 and is_object_dtype(out.dtype): out[:] = coerce_to_dtypes(out.values, df.dtypes) return out + elif numeric_only is None and axis == 0 and not any_object: + # axis==1 is tricky, handled separately + # object dtypes need to go through column-wise path + df = self + if axis == 1: + df = df.T + + def blk_func(values): + if isinstance(values, ExtensionArray): + return values._reduce(name, skipna=skipna, **kwds) + else: + return op(values, axis=1, skipna=skipna, **kwds) + + res, indexer = df._mgr.reduce(blk_func, ignore_failures=True) + out = df._constructor(res).iloc[0].rename(None) + + out_dtype = "bool" if filter_type 
== "bool" else None + if out_dtype is not None: + out = out.astype(out_dtype) + + if axis == 0 and is_object_dtype(out.dtype): + # GH#35865 careful to cast explicitly to object + nvs = coerce_to_dtypes(out.values, df.dtypes.iloc[indexer]) + out[:] = np.array(nvs, dtype=object) + return out + if not self._is_homogeneous_type or self._mgr.any_extension_types: # try to avoid self.values call diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 297ad3077ef1d..fdf19083dbe03 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -330,18 +330,31 @@ def _verify_integrity(self) -> None: f"tot_items: {tot_items}" ) - def reduce(self: T, func) -> T: + def reduce(self: T, func, ignore_failures: bool = False) -> Tuple[T, np.ndarray]: # If 2D, we assume that we're operating column-wise assert self.ndim == 2 - res_blocks = [] - for blk in self.blocks: - nbs = blk.reduce(func) + res_blocks: List[Block] = [] + skipped: List[int] = [] + for i, blk in enumerate(self.blocks): + try: + nbs = blk.reduce(func) + except TypeError: + if ignore_failures: + skipped.append(i) + continue + raise res_blocks.extend(nbs) + if res_blocks: + indexer = np.concatenate([blk.mgr_locs.as_array for blk in res_blocks]) + else: + indexer = [] + + new_items = self.reset_dropped_locs(res_blocks, skipped) index = Index([0]) # placeholder - new_mgr = BlockManager.from_blocks(res_blocks, [self.items, index]) - return new_mgr + new_mgr = BlockManager.from_blocks(res_blocks, [new_items, index]) + return new_mgr, indexer def operate_blockwise(self, other: "BlockManager", array_op) -> "BlockManager": """ @@ -1499,6 +1512,12 @@ def reset_dropped_locs(self, blocks: List[Block], skipped: List[int]) -> Index: ----- Alters each block's mgr_locs inplace. 
""" + if not skipped: + return self.items.copy() + elif not blocks: + # empty index with same dtype and name + return self.items[:0] + ncols = len(self) new_locs = [blk.mgr_locs.as_array for blk in blocks] From a58fdf06b4f89b81a1192d48254fe62f768815d8 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 24 Aug 2020 17:53:11 -0700 Subject: [PATCH 09/15] de-duplicate --- pandas/core/frame.py | 33 ++++++--------------------------- 1 file changed, 6 insertions(+), 27 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 43610a57dce50..ffb50a8f22215 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8632,7 +8632,9 @@ def _get_data(axis_matters): raise NotImplementedError(msg) return data - if numeric_only is not None and axis in [0, 1]: + if (numeric_only is not None and axis in [0, 1]) or ( + numeric_only is None and axis == 0 and not any_object + ): df = self if numeric_only is True: df = _get_data(axis_matters=True) @@ -8641,6 +8643,7 @@ def _get_data(axis_matters): axis = 0 out_dtype = "bool" if filter_type == "bool" else None + ignore_failures = numeric_only is None def blk_func(values): if isinstance(values, ExtensionArray): @@ -8650,37 +8653,13 @@ def blk_func(values): # After possibly _get_data and transposing, we are now in the # simple case where we can use BlockManager.reduce - res, _ = df._mgr.reduce(blk_func) - out = df._constructor(res).iloc[0].rename(None) - if out_dtype is not None: - out = out.astype(out_dtype) - if axis == 0 and is_object_dtype(out.dtype): - out[:] = coerce_to_dtypes(out.values, df.dtypes) - return out - - elif numeric_only is None and axis == 0 and not any_object: - # axis==1 is tricky, handled separately - # object dtypes need to go through column-wise path - df = self - if axis == 1: - df = df.T - - def blk_func(values): - if isinstance(values, ExtensionArray): - return values._reduce(name, skipna=skipna, **kwds) - else: - return op(values, axis=1, skipna=skipna, **kwds) - - res, indexer = 
df._mgr.reduce(blk_func, ignore_failures=True) + res, indexer = df._mgr.reduce(blk_func, ignore_failures=ignore_failures) out = df._constructor(res).iloc[0].rename(None) - - out_dtype = "bool" if filter_type == "bool" else None if out_dtype is not None: out = out.astype(out_dtype) - if axis == 0 and is_object_dtype(out.dtype): # GH#35865 careful to cast explicitly to object - nvs = coerce_to_dtypes(out.values, df.dtypes.iloc[indexer]) + nvs = coerce_to_dtypes(out.values, df.dtypes.iloc[np.sort(indexer)]) out[:] = np.array(nvs, dtype=object) return out From cdad23d02d08196c3e9bd0d1c41dce3a4bb6189f Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 25 Aug 2020 09:34:35 -0700 Subject: [PATCH 10/15] mypy fixup --- pandas/core/internals/managers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 4dd4aa1048d8c..cd7a9d66659fc 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -368,7 +368,7 @@ def reduce( new_mgr = self._combine(res_blocks, copy=False, index=index) else: indexer = [] - new_mgr = BlockManager.from_blocks([], [Index([]), index]) + new_mgr = type(self).from_blocks([], [Index([]), index]) return new_mgr, indexer From 028a0b7757e66cdf1077fa61cb43f2a5e1497ec3 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 25 Aug 2020 10:00:53 -0700 Subject: [PATCH 11/15] mypy fixup --- pandas/core/internals/managers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index cd7a9d66659fc..f22ba29a23a41 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -751,8 +751,8 @@ def get_numeric_data(self, copy: bool = False) -> "BlockManager": return self._combine([b for b in self.blocks if b.is_numeric], copy) def _combine( - self, blocks: List[Block], copy: bool = True, index: Optional[Index] = None - ) -> "BlockManager": + self: 
T, blocks: List[Block], copy: bool = True, index: Optional[Index] = None + ) -> T: """ return a new manager with the blocks """ if len(blocks) == 0: return self.make_empty() From 2f10b724f0c20f6e9d7842742e28a7acd7a49126 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 27 Aug 2020 07:39:30 -0700 Subject: [PATCH 12/15] comment --- pandas/core/frame.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b145990ebcec4..5cbee6c1a6dac 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8635,6 +8635,10 @@ def _get_data(axis_matters): if (numeric_only is not None and axis in [0, 1]) or ( numeric_only is None and axis == 0 and not any_object ): + # For numeric_only non-None and axis non-None, we know + # which blocks to use and no try/except is needed. + # For numeric_only=None only the case with axis==0 and no object + # dtypes are unambiguous can be handled with BlockManager.reduce df = self if numeric_only is True: df = _get_data(axis_matters=True) From 23e3a6aa0e2671e0d0872c6099894ffdd1ebba6d Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 3 Sep 2020 20:32:34 -0700 Subject: [PATCH 13/15] update tested behavior --- pandas/tests/frame/test_analytics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index b0ba0d991c9b0..bf6e500a17b89 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1108,10 +1108,10 @@ def test_any_all_bool_only(self): True, marks=[td.skip_if_np_lt("1.15")], ), - (np.all, {"A": pd.Series([0, 1], dtype="category")}, False), - (np.any, {"A": pd.Series([0, 1], dtype="category")}, True), + (np.all, {"A": pd.Series([0, 1], dtype="category")}, True), + (np.any, {"A": pd.Series([0, 1], dtype="category")}, False), (np.all, {"A": pd.Series([1, 2], dtype="category")}, True), - (np.any, {"A": pd.Series([1, 2], dtype="category")}, True), + (np.any, {"A": 
pd.Series([1, 2], dtype="category")}, False), # Mix GH#21484 pytest.param( np.all, From ccea5a52cf18e67875cb4d74e5b9e3bc2a073ee6 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 7 Sep 2020 10:26:43 -0700 Subject: [PATCH 14/15] whatsnew --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 9a778acba4764..d261789a8f186 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -244,7 +244,7 @@ Timezones Numeric ^^^^^^^ -- +- Bug in :class:`DataFrame` reductions incorrectly ignoring ``ExtensionArray`` behaviors (:issue:`35881`) - Conversion From f349ef75d3faef93b0f2a5892769786527a0e462 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 6 Oct 2020 14:32:39 -0700 Subject: [PATCH 15/15] Revert behavior-changing component --- doc/source/whatsnew/v1.2.0.rst | 1 - pandas/core/frame.py | 6 +++++- pandas/tests/frame/test_analytics.py | 6 +++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 9f99612509abb..dc65ed238799a 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -336,7 +336,6 @@ Numeric - Bug in :class:`Series` where two :class:`Series` each have a :class:`DatetimeIndex` with different timezones having those indexes incorrectly changed when performing arithmetic operations (:issue:`33671`) - Bug in :meth:`pd._testing.assert_almost_equal` was incorrect for complex numeric types (:issue:`28235`) - Bug in :meth:`DataFrame.__rmatmul__` error handling reporting transposed shapes (:issue:`21581`) -- Bug in :class:`DataFrame` reductions with a single :class:`ExtensionArray` column such where the reduction would raise ``TypeError`` on the array incorrectly casting to a ``NumPy`` array and sometimes giving incorrect results (:issue:`35881`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 
eb90ff15c7147..8efe2fc090fc5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8623,12 +8623,16 @@ def _get_data() -> DataFrame: return data if numeric_only is not None or ( - numeric_only is None and axis == 0 and not any_object + numeric_only is None + and axis == 0 + and not any_object + and not self._mgr.any_extension_types ): # For numeric_only non-None and axis non-None, we know # which blocks to use and no try/except is needed. # For numeric_only=None only the case with axis==0 and no object # dtypes are unambiguous can be handled with BlockManager.reduce + # Case with EAs see GH#35881 df = self if numeric_only is True: df = _get_data() diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 6cb90b474947d..ee136533b0775 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1068,10 +1068,10 @@ def test_any_all_bool_only(self): pytest.param(np.any, {"A": pd.Series([0, 1], dtype="m8[ns]")}, True), pytest.param(np.all, {"A": pd.Series([1, 2], dtype="m8[ns]")}, True), pytest.param(np.any, {"A": pd.Series([1, 2], dtype="m8[ns]")}, True), - (np.all, {"A": pd.Series([0, 1], dtype="category")}, True), - (np.any, {"A": pd.Series([0, 1], dtype="category")}, False), + (np.all, {"A": pd.Series([0, 1], dtype="category")}, False), + (np.any, {"A": pd.Series([0, 1], dtype="category")}, True), (np.all, {"A": pd.Series([1, 2], dtype="category")}, True), - (np.any, {"A": pd.Series([1, 2], dtype="category")}, False), + (np.any, {"A": pd.Series([1, 2], dtype="category")}, True), # Mix GH#21484 pytest.param( np.all,