diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 2b27d37904599..9dd4fb68ae26a 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -255,7 +255,7 @@ i.e., from the end of the string to the beginning of the string: s2.str.rsplit("_", expand=True, n=1) -``replace`` by default replaces `regular expressions +``replace`` optionally uses `regular expressions <https://docs.python.org/3/library/re.html>`__: .. ipython:: python @@ -265,35 +265,27 @@ i.e., from the end of the string to the beginning of the string: dtype="string", ) s3 - s3.str.replace("^.a|dog", "XX-XX ", case=False) + s3.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) -Some caution must be taken to keep regular expressions in mind! For example, the -following code will cause trouble because of the regular expression meaning of -``$``: - -.. ipython:: python - - # Consider the following badly formatted financial data - dollars = pd.Series(["12", "-$10", "$10,000"], dtype="string") - - # This does what you'd naively expect: - dollars.str.replace("$", "") +.. warning:: - # But this doesn't: - dollars.str.replace("-$", "-") + Some caution must be taken when dealing with regular expressions! The current behavior + is to treat single character patterns as literal strings, even when ``regex`` is set + to ``True``. This behavior is deprecated and will be removed in a future version so + that the ``regex`` keyword is always respected. - # We need to escape the special character (for >1 len patterns) - dollars.str.replace(r"-\$", "-") +.. versionchanged:: 1.2.0 -If you do want literal replacement of a string (equivalent to -:meth:`str.replace`), you can set the optional ``regex`` parameter to -``False``, rather than escaping each character. In this case both ``pat`` -and ``repl`` must be strings: +If you want literal replacement of a string (equivalent to :meth:`str.replace`), you +can set the optional ``regex`` parameter to ``False``, rather than escaping each +character. 
In this case both ``pat`` and ``repl`` must be strings: .. ipython:: python + dollars = pd.Series(["12", "-$10", "$10,000"], dtype="string") + # These lines are equivalent - dollars.str.replace(r"-\$", "-") + dollars.str.replace(r"-\$", "-", regex=True) dollars.str.replace("-$", "-", regex=False) The ``replace`` method can also take a callable as replacement. It is called @@ -310,7 +302,10 @@ positional argument (a regex object) and return a string. return m.group(0)[::-1] - pd.Series(["foo 123", "bar baz", np.nan], dtype="string").str.replace(pat, repl) + pd.Series( + ["foo 123", "bar baz", np.nan], + dtype="string" + ).str.replace(pat, repl, regex=True) # Using regex groups pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)" @@ -320,7 +315,9 @@ positional argument (a regex object) and return a string. return m.group("two").swapcase() - pd.Series(["Foo Bar Baz", np.nan], dtype="string").str.replace(pat, repl) + pd.Series(["Foo Bar Baz", np.nan], dtype="string").str.replace( + pat, repl, regex=True + ) The ``replace`` method also accepts a compiled regular expression object from :func:`re.compile` as a pattern. All flags should be included in the @@ -331,7 +328,7 @@ compiled regular expression object. import re regex_pat = re.compile(r"^.a|dog", flags=re.IGNORECASE) - s3.str.replace(regex_pat, "XX-XX ") + s3.str.replace(regex_pat, "XX-XX ", regex=True) Including a ``flags`` argument when calling ``replace`` with a compiled regular expression object will raise a ``ValueError``. 
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2f462b16ddf78..d8ae229dc354c 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -287,6 +287,7 @@ Deprecations - Deprecated indexing :class:`DataFrame` rows with datetime-like strings ``df[string]``, use ``df.loc[string]`` instead (:issue:`36179`) - Deprecated casting an object-dtype index of ``datetime`` objects to :class:`DatetimeIndex` in the :class:`Series` constructor (:issue:`23598`) - Deprecated :meth:`Index.is_all_dates` (:issue:`27744`) +- The default value of ``regex`` for :meth:`Series.str.replace` will change from ``True`` to ``False`` in a future release. In addition, single character regular expressions will *not* be treated as literal strings when ``regex=True`` is set. (:issue:`24804`) - Deprecated automatic alignment on comparison operations between :class:`DataFrame` and :class:`Series`, do ``frame, ser = frame.align(ser, axis=1, copy=False)`` before e.g. ``frame == ser`` (:issue:`28759`) - :meth:`Rolling.count` with ``min_periods=None`` will default to the size of the window in a future version (:issue:`31302`) - Deprecated slice-indexing on timezone-aware :class:`DatetimeIndex` with naive ``datetime`` objects, to match scalar indexing behavior (:issue:`36148`) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index e2161013c0166..bcdb223415813 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -483,7 +483,7 @@ def melt_stub(df, stub: str, i, j, value_vars, sep: str): var_name=j, ) newdf[j] = Categorical(newdf[j]) - newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "") + newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", regex=True) # GH17627 Cast numerics suffixes to int/float newdf[j] = to_numeric(newdf[j], errors="ignore") diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index cae8cc1baf1df..df37cd47a9e7c 100644 --- 
a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1170,7 +1170,7 @@ def fullmatch(self, pat, case=True, flags=0, na=None): return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) - def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): + def replace(self, pat, repl, n=-1, case=None, flags=0, regex=None): r""" Replace each occurrence of pattern/regex in the Series/Index. @@ -1288,6 +1288,20 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): 2 NaN dtype: object """ + if regex is None: + if isinstance(pat, str) and any(c in pat for c in ".+*|^$?[](){}\\"): + # warn only in cases where regex behavior would differ from literal + msg = ( + "The default value of regex will change from True to False " + "in a future version." + ) + if len(pat) == 1: + msg += ( + " In addition, single character regular expressions will " + "*not* be treated as literal strings when regex=True." + ) + warnings.warn(msg, FutureWarning, stacklevel=3) + regex = True result = self._array._str_replace( pat, repl, n=n, case=case, flags=flags, regex=regex ) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index e255d46e81851..79d6fc22aba97 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -449,3 +449,14 @@ def test_replace_with_compiled_regex(self): result = s.replace({regex: "z"}, regex=True) expected = pd.Series(["z", "b", "c"]) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("pattern", ["^.$", "."]) + def test_str_replace_regex_default_raises_warning(self, pattern): + # https://github.com/pandas-dev/pandas/pull/24809 + s = pd.Series(["a", "b", "c"]) + msg = r"The default value of regex will change from True to False" + if len(pattern) == 1: + msg += r".*single character regular expressions.*not.*literal strings" + with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w: + s.str.replace(pattern, "") + assert re.match(msg, str(w[0].message)) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 6ad55639ae5d8..61df5d4d5fdd6 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -984,11 +984,11 @@ def test_casemethods(self): def test_replace(self): values = Series(["fooBAD__barBAD", np.nan]) - result = values.str.replace("BAD[_]*", "") + result = values.str.replace("BAD[_]*", "", regex=True) exp = Series(["foobar", np.nan]) tm.assert_series_equal(result, exp) - result = values.str.replace("BAD[_]*", "", n=1) + result = values.str.replace("BAD[_]*", "", n=1, regex=True) exp = Series(["foobarBAD", np.nan]) tm.assert_series_equal(result, exp) @@ -997,7 +997,7 @@ def test_replace(self): ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) - rs = Series(mixed).str.replace("BAD[_]*", "") + rs = Series(mixed).str.replace("BAD[_]*", "", regex=True) xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) @@ -1005,7 +1005,9 @@ def test_replace(self): # flags + unicode values = Series([b"abcd,\xc3\xa0".decode("utf-8")]) exp = Series([b"abcd, \xc3\xa0".decode("utf-8")]) - result = values.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE) + result = values.str.replace( + r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True + ) tm.assert_series_equal(result, exp) # GH 13438 @@ -1023,7 +1025,7 @@ def test_replace_callable(self): # test with callable repl = lambda m: m.group(0).swapcase() - result = values.str.replace("[a-z][A-Z]{2}", repl, n=2) + result = values.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) exp = Series(["foObaD__baRbaD", np.nan]) tm.assert_series_equal(result, exp) @@ -1049,7 +1051,7 @@ def test_replace_callable(self): values = Series(["Foo Bar Baz", np.nan]) pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)" repl 
= lambda m: m.group("middle").swapcase() - result = values.str.replace(pat, repl) + result = values.str.replace(pat, repl, regex=True) exp = Series(["bAR", np.nan]) tm.assert_series_equal(result, exp) @@ -1059,11 +1061,11 @@ def test_replace_compiled_regex(self): # test with compiled regex pat = re.compile(r"BAD[_]*") - result = values.str.replace(pat, "") + result = values.str.replace(pat, "", regex=True) exp = Series(["foobar", np.nan]) tm.assert_series_equal(result, exp) - result = values.str.replace(pat, "", n=1) + result = values.str.replace(pat, "", n=1, regex=True) exp = Series(["foobarBAD", np.nan]) tm.assert_series_equal(result, exp) @@ -1072,7 +1074,7 @@ def test_replace_compiled_regex(self): ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) - rs = Series(mixed).str.replace(pat, "") + rs = Series(mixed).str.replace(pat, "", regex=True) xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) @@ -1110,7 +1112,7 @@ def test_replace_literal(self): # GH16808 literal replace (regex=False vs regex=True) values = Series(["f.o", "foo", np.nan]) exp = Series(["bao", "bao", np.nan]) - result = values.str.replace("f.", "ba") + result = values.str.replace("f.", "ba", regex=True) tm.assert_series_equal(result, exp) exp = Series(["bao", "foo", np.nan]) @@ -3044,7 +3046,7 @@ def test_pipe_failures(self): tm.assert_series_equal(result, exp) - result = s.str.replace("|", " ") + result = s.str.replace("|", " ", regex=False) exp = Series(["A B C"]) tm.assert_series_equal(result, exp) @@ -3345,7 +3347,7 @@ def test_replace_moar(self): ) tm.assert_series_equal(result, expected) - result = s.str.replace("^.a|dog", "XX-XX ", case=False) + result = s.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) expected = Series( [ "A",