From 37da2c100fd09cbe8f98c78c642a978c3fae98c9 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 28 Feb 2019 06:42:29 -0800 Subject: [PATCH] Backport PR #25266: BUG: Fix regression on DataFrame.replace for regex (#25477) --- doc/source/whatsnew/v0.24.2.rst | 1 + pandas/core/internals/managers.py | 12 ++++++------ pandas/tests/frame/test_replace.py | 7 +++++++ 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 8f4beb3f484a4..4fcde7769b362 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -23,6 +23,7 @@ Fixed Regressions - Fixed regression in :meth:`DataFrame.all` and :meth:`DataFrame.any` where ``bool_only=True`` was ignored (:issue:`25101`) - Fixed issue in ``DataFrame`` construction with passing a mixed list of mixed types could segfault. (:issue:`25075`) - Fixed regression in :meth:`DataFrame.apply` causing ``RecursionError`` when ``dict``-like classes were passed as argument. (:issue:`25196`) +- Fixed regression in :meth:`DataFrame.replace` where ``regex=True`` was only replacing patterns matching the start of the string (:issue:`25259`) - Fixed regression in :meth:`DataFrame.duplicated()`, where empty dataframe was not returning a boolean dtyped Series. (:issue:`25184`) - Fixed regression in :meth:`Series.min` and :meth:`Series.max` where ``numeric_only=True`` was ignored when the ``Series`` contained ```Categorical`` data (:issue:`25299`) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 050c3d3e87fc6..5725b80990239 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -552,9 +552,9 @@ def comp(s, regex=False): if isna(s): return isna(values) if hasattr(s, 'asm8'): - return _compare_or_regex_match(maybe_convert_objects(values), - getattr(s, 'asm8'), regex) - return _compare_or_regex_match(values, s, regex) + return _compare_or_regex_search(maybe_convert_objects(values), + getattr(s, 'asm8'), regex) + return _compare_or_regex_search(values, s, regex) masks = [comp(s, regex) for i, s in enumerate(src_list)] @@ -1901,11 +1901,11 @@ def _consolidate(blocks): return new_blocks -def _compare_or_regex_match(a, b, regex=False): +def _compare_or_regex_search(a, b, regex=False): """ Compare two array_like inputs of the same shape or two scalar values - Calls operator.eq or re.match, depending on regex argument. If regex is + Calls operator.eq or re.search, depending on regex argument. If regex is True, perform an element-wise regex matching. Parameters @@ -1921,7 +1921,7 @@ def _compare_or_regex_match(a, b, regex=False): if not regex: op = lambda x: operator.eq(x, b) else: - op = np.vectorize(lambda x: bool(re.match(b, x)) if isinstance(x, str) + op = np.vectorize(lambda x: bool(re.search(b, x)) if isinstance(x, str) else False) is_a_array = isinstance(a, np.ndarray) diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index 219f7a1585fc2..127a64da38ba3 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -466,6 +466,13 @@ def test_regex_replace_dict_nested(self): assert_frame_equal(res3, expec) assert_frame_equal(res4, expec) + def test_regex_replace_dict_nested_non_first_character(self): + # GH 25259 + df = pd.DataFrame({'first': ['abc', 'bca', 'cab']}) + expected = pd.DataFrame({'first': ['.bc', 'bc.', 'c.b']}) + result = df.replace({'a': '.'}, regex=True) + assert_frame_equal(result, expected) + def test_regex_replace_dict_nested_gh4115(self): df = pd.DataFrame({'Type': ['Q', 'T', 'Q', 'Q', 'T'], 'tmp': 2}) expected = DataFrame({'Type': [0, 1, 0, 0, 1], 'tmp': 2})