From 50c40ff1afa4a4a6772225e02c320294c422ed1a Mon Sep 17 00:00:00 2001 From: Flavien Lambert Date: Thu, 28 Feb 2019 21:55:33 +0800 Subject: [PATCH] BUG: Fix regression on DataFrame.replace for regex (#25266) * BUG: Fix regression on DataFrame.replace for regex The commit ensures that the replacement for regex is not confined to the beginning of the string but spans all the characters within. The behaviour is then consistent with versions prior to 0.24.0. One test has been added to account for character replacement when the character is not at the beginning of the string. --- doc/source/whatsnew/v0.24.2.rst | 1 + pandas/core/internals/managers.py | 12 ++++++------ pandas/tests/frame/test_replace.py | 7 +++++++ 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 8f4beb3f484a4..4fcde7769b362 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -23,6 +23,7 @@ Fixed Regressions - Fixed regression in :meth:`DataFrame.all` and :meth:`DataFrame.any` where ``bool_only=True`` was ignored (:issue:`25101`) - Fixed issue in ``DataFrame`` construction with passing a mixed list of mixed types could segfault. (:issue:`25075`) - Fixed regression in :meth:`DataFrame.apply` causing ``RecursionError`` when ``dict``-like classes were passed as argument. (:issue:`25196`) +- Fixed regression in :meth:`DataFrame.replace` where ``regex=True`` was only replacing patterns matching the start of the string (:issue:`25259`) - Fixed regression in :meth:`DataFrame.duplicated()`, where empty dataframe was not returning a boolean dtyped Series. (:issue:`25184`) - Fixed regression in :meth:`Series.min` and :meth:`Series.max` where ``numeric_only=True`` was ignored when the ``Series`` contained ```Categorical`` data (:issue:`25299`) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 38b719db1709f..407db772d73e8 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -552,9 +552,9 @@ def comp(s, regex=False): if isna(s): return isna(values) if hasattr(s, 'asm8'): - return _compare_or_regex_match(maybe_convert_objects(values), - getattr(s, 'asm8'), regex) - return _compare_or_regex_match(values, s, regex) + return _compare_or_regex_search(maybe_convert_objects(values), + getattr(s, 'asm8'), regex) + return _compare_or_regex_search(values, s, regex) masks = [comp(s, regex) for i, s in enumerate(src_list)] @@ -1897,11 +1897,11 @@ def _consolidate(blocks): return new_blocks -def _compare_or_regex_match(a, b, regex=False): +def _compare_or_regex_search(a, b, regex=False): """ Compare two array_like inputs of the same shape or two scalar values - Calls operator.eq or re.match, depending on regex argument. If regex is + Calls operator.eq or re.search, depending on regex argument. If regex is True, perform an element-wise regex matching. Parameters @@ -1917,7 +1917,7 @@ def _compare_or_regex_match(a, b, regex=False): if not regex: op = lambda x: operator.eq(x, b) else: - op = np.vectorize(lambda x: bool(re.match(b, x)) if isinstance(x, str) + op = np.vectorize(lambda x: bool(re.search(b, x)) if isinstance(x, str) else False) is_a_array = isinstance(a, np.ndarray) diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index 219f7a1585fc2..127a64da38ba3 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -466,6 +466,13 @@ def test_regex_replace_dict_nested(self): assert_frame_equal(res3, expec) assert_frame_equal(res4, expec) + def test_regex_replace_dict_nested_non_first_character(self): + # GH 25259 + df = pd.DataFrame({'first': ['abc', 'bca', 'cab']}) + expected = pd.DataFrame({'first': ['.bc', 'bc.', 'c.b']}) + result = df.replace({'a': '.'}, regex=True) + assert_frame_equal(result, expected) + def test_regex_replace_dict_nested_gh4115(self): df = pd.DataFrame({'Type': ['Q', 'T', 'Q', 'Q', 'T'], 'tmp': 2}) expected = DataFrame({'Type': [0, 1, 0, 0, 1], 'tmp': 2})