BUG: Fix regression on DataFrame.replace for regex (#25266)

* BUG: Fix regression on DataFrame.replace for regex The commit ensures that the replacement for regex is not confined to the beginning of the string but spans all the characters within. The behaviour is then consistent with versions prior to 0.24.0. One test has been added to account for character replacement when the character is not at the beginning of the string.
pandas-dev · Feb 28, 2019 · 50c40ff · 50c40ff
1 parent 64e5612
commit 50c40ff
Show file tree

Hide file tree

Showing 3 changed files with 14 additions and 6 deletions.
diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst
@@ -23,6 +23,7 @@ Fixed Regressions
 - Fixed regression in :meth:`DataFrame.all` and :meth:`DataFrame.any` where ``bool_only=True`` was ignored (:issue:`25101`)
 - Fixed issue in ``DataFrame`` construction with passing a mixed list of mixed types could segfault. (:issue:`25075`)
 - Fixed regression in :meth:`DataFrame.apply` causing ``RecursionError`` when ``dict``-like classes were passed as argument. (:issue:`25196`)
+- Fixed regression in :meth:`DataFrame.replace` where ``regex=True`` was only replacing patterns matching the start of the string (:issue:`25259`)
 
 - Fixed regression in :meth:`DataFrame.duplicated()`, where empty dataframe was not returning a boolean dtyped Series. (:issue:`25184`)
 - Fixed regression in :meth:`Series.min` and :meth:`Series.max` where ``numeric_only=True`` was ignored when the ``Series`` contained ```Categorical`` data (:issue:`25299`)

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -552,9 +552,9 @@ def comp(s, regex=False):
             if isna(s):
                 return isna(values)
             if hasattr(s, 'asm8'):
-                return _compare_or_regex_match(maybe_convert_objects(values),
-                                               getattr(s, 'asm8'), regex)
-            return _compare_or_regex_match(values, s, regex)
+                return _compare_or_regex_search(maybe_convert_objects(values),
+                                                getattr(s, 'asm8'), regex)
+            return _compare_or_regex_search(values, s, regex)
 
         masks = [comp(s, regex) for i, s in enumerate(src_list)]
 
@@ -1897,11 +1897,11 @@ def _consolidate(blocks):
     return new_blocks
 
 
-def _compare_or_regex_match(a, b, regex=False):
+def _compare_or_regex_search(a, b, regex=False):
     """
     Compare two array_like inputs of the same shape or two scalar values
 
-    Calls operator.eq or re.match, depending on regex argument. If regex is
+    Calls operator.eq or re.search, depending on regex argument. If regex is
     True, perform an element-wise regex matching.
 
     Parameters
@@ -1917,7 +1917,7 @@ def _compare_or_regex_match(a, b, regex=False):
     if not regex:
         op = lambda x: operator.eq(x, b)
     else:
-        op = np.vectorize(lambda x: bool(re.match(b, x)) if isinstance(x, str)
+        op = np.vectorize(lambda x: bool(re.search(b, x)) if isinstance(x, str)
                           else False)
 
     is_a_array = isinstance(a, np.ndarray)

diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py
@@ -466,6 +466,13 @@ def test_regex_replace_dict_nested(self):
         assert_frame_equal(res3, expec)
         assert_frame_equal(res4, expec)
 
+    def test_regex_replace_dict_nested_non_first_character(self):
+        # GH 25259
+        df = pd.DataFrame({'first': ['abc', 'bca', 'cab']})
+        expected = pd.DataFrame({'first': ['.bc', 'bc.', 'c.b']})
+        result = df.replace({'a': '.'}, regex=True)
+        assert_frame_equal(result, expected)
+
     def test_regex_replace_dict_nested_gh4115(self):
         df = pd.DataFrame({'Type': ['Q', 'T', 'Q', 'Q', 'T'], 'tmp': 2})
         expected = DataFrame({'Type': [0, 1, 0, 0, 1], 'tmp': 2})