From 19d6a61f5e627ec0750b3411a90d7bb65b0e5ee7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 8 Nov 2020 16:37:00 -0800 Subject: [PATCH] REF: simplify NDFrame.replace, ObjectBlock.replace (#37704) --- pandas/core/generic.py | 34 ++++++------- pandas/core/internals/blocks.py | 48 +++++++------------ .../tests/arrays/categorical/test_replace.py | 3 +- 3 files changed, 37 insertions(+), 48 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bea650c1b50fd..02fa7308e7ee8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6744,25 +6744,25 @@ def replace( else: raise TypeError("value argument must be scalar, dict, or Series") - elif is_list_like(to_replace): # [NA, ''] -> [0, 'missing'] - if is_list_like(value): - if len(to_replace) != len(value): - raise ValueError( - f"Replacement lists must match in length. " - f"Expecting {len(to_replace)} got {len(value)} " - ) - self._consolidate_inplace() - new_data = self._mgr.replace_list( - src_list=to_replace, - dest_list=value, - inplace=inplace, - regex=regex, + elif is_list_like(to_replace): + if not is_list_like(value): + # e.g. to_replace = [NA, ''] and value is 0, + # so we replace NA with 0 and then replace '' with 0 + value = [value] * len(to_replace) + + # e.g. we have to_replace = [NA, ''] and value = [0, 'missing'] + if len(to_replace) != len(value): + raise ValueError( + f"Replacement lists must match in length. 
" + f"Expecting {len(to_replace)} got {len(value)} " ) + new_data = self._mgr.replace_list( + src_list=to_replace, + dest_list=value, + inplace=inplace, + regex=regex, + ) - else: # [NA, ''] -> 0 - new_data = self._mgr.replace( - to_replace=to_replace, value=value, inplace=inplace, regex=regex - ) elif to_replace is None: if not ( is_re_compilable(regex) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 8e01aaa396265..9e6480dd709f0 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2502,39 +2502,14 @@ def replace( inplace: bool = False, regex: bool = False, ) -> List["Block"]: - to_rep_is_list = is_list_like(to_replace) - value_is_list = is_list_like(value) - both_lists = to_rep_is_list and value_is_list - either_list = to_rep_is_list or value_is_list + # Note: the checks we do in NDFrame.replace ensure we never get + # here with listlike to_replace or value, as those cases + # go through _replace_list - result_blocks: List["Block"] = [] - blocks: List["Block"] = [self] - - if not either_list and is_re(to_replace): + if is_re(to_replace) or regex: return self._replace_single(to_replace, value, inplace=inplace, regex=True) - elif not (either_list or regex): + else: return super().replace(to_replace, value, inplace=inplace, regex=regex) - elif both_lists: - for to_rep, v in zip(to_replace, value): - result_blocks = [] - for b in blocks: - result = b._replace_single(to_rep, v, inplace=inplace, regex=regex) - result_blocks.extend(result) - blocks = result_blocks - return result_blocks - - elif to_rep_is_list and regex: - for to_rep in to_replace: - result_blocks = [] - for b in blocks: - result = b._replace_single( - to_rep, value, inplace=inplace, regex=regex - ) - result_blocks.extend(result) - blocks = result_blocks - return result_blocks - - return self._replace_single(to_replace, value, inplace=inplace, regex=regex) def _replace_single( self, @@ -2627,6 +2602,19 @@ def re_replacer(s): class 
CategoricalBlock(ExtensionBlock): __slots__ = () + def _replace_list( + self, + src_list: List[Any], + dest_list: List[Any], + inplace: bool = False, + regex: bool = False, + ) -> List["Block"]: + if len(algos.unique(dest_list)) == 1: + # We likely got here by tiling value inside NDFrame.replace, + # so un-tile here + return self.replace(src_list, dest_list[0], inplace, regex) + return super()._replace_list(src_list, dest_list, inplace, regex) + def replace( self, to_replace, diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index 5889195ad68db..007c4bdea17f8 100644 --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -21,6 +21,7 @@ ((1, 2, 4), 5, [5, 5, 3], False), ((5, 6), 2, [1, 2, 3], False), # many-to-many, handled outside of Categorical and results in separate dtype + # except for cases with only 1 unique entry in `value` ([1], [2], [2, 2, 3], True), ([1, 4], [5, 2], [5, 2, 3], True), # check_categorical sorts categories, which crashes on mixed dtypes @@ -30,7 +31,7 @@ ) def test_replace(to_replace, value, expected, flip_categories): # GH 31720 - stays_categorical = not isinstance(value, list) + stays_categorical = not isinstance(value, list) or len(pd.unique(value)) == 1 s = pd.Series([1, 2, 3], dtype="category") result = s.replace(to_replace, value)