Skip to content

Commit

Permalink
DEPR: Change str.replace(regex) from True to False & single behavior (p…
Browse files Browse the repository at this point in the history
…andas-dev#49486)

* DEPR: Change str.replace(regex) from True to False & single behavior

* Add versionchanged
  • Loading branch information
mroeschke authored and noatamir committed Nov 9, 2022
1 parent abbe952 commit 59f58e4
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 70 deletions.
14 changes: 8 additions & 6 deletions doc/source/user_guide/text.rst
Original file line number Diff line number Diff line change
Expand Up @@ -267,14 +267,16 @@ i.e., from the end of the string to the beginning of the string:
s3
s3.str.replace("^.a|dog", "XX-XX ", case=False, regex=True)
.. warning::
Some caution must be taken when dealing with regular expressions! The current behavior
is to treat single character patterns as literal strings, even when ``regex`` is set
to ``True``. This behavior is deprecated and will be removed in a future version so
that the ``regex`` keyword is always respected.
.. versionchanged:: 2.0

A single-character pattern with ``regex=True`` is also treated as a regular expression:

.. ipython:: python
.. versionchanged:: 1.2.0
s4 = pd.Series(["a.b", ".", "b", np.nan, ""], dtype="string")
s4
s4.str.replace(".", "a", regex=True)
If you want literal replacement of a string (equivalent to :meth:`str.replace`), you
can set the optional ``regex`` parameter to ``False``, rather than escaping each
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ Removal of prior version deprecations/changes
- Changed behavior of :class:`Index` constructor when passed a ``SparseArray`` or ``SparseDtype`` to retain that dtype instead of casting to ``numpy.ndarray`` (:issue:`43930`)
- Changed behavior of :class:`Index`, :class:`Series`, :class:`DataFrame` constructors with floating-dtype data and a :class:`DatetimeTZDtype`, the data are now interpreted as UTC-times instead of wall-times, consistent with how integer-dtype data are treated (:issue:`45573`)
- Removed the deprecated ``base`` and ``loffset`` arguments from :meth:`pandas.DataFrame.resample`, :meth:`pandas.Series.resample` and :class:`pandas.Grouper`. Use ``offset`` or ``origin`` instead (:issue:`31809`)
- Changed the default value of the ``regex`` argument of :meth:`Series.str.replace` from ``True`` to ``False``. Additionally, a single-character ``pat`` with ``regex=True`` is now treated as a regular expression instead of a string literal (:issue:`36695`, :issue:`24804`)
- Changed behavior of :meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True``; object-dtype columns with all-bool values will no longer be included, manually cast to ``bool`` dtype first (:issue:`46188`)
- Changed behavior of comparison of a :class:`Timestamp` with a ``datetime.date`` object; these now compare as un-equal and raise on inequality comparisons, matching the ``datetime.datetime`` behavior (:issue:`36131`)
- Enforced deprecation of silently dropping columns that raised a ``TypeError`` in :class:`Series.transform` and :class:`DataFrame.transform` when used with a list or dictionary (:issue:`43740`)
Expand Down
28 changes: 2 additions & 26 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1323,7 +1323,7 @@ def replace(
n: int = -1,
case: bool | None = None,
flags: int = 0,
regex: bool | None = None,
regex: bool = False,
):
r"""
Replace each occurrence of pattern/regex in the Series/Index.
Expand Down Expand Up @@ -1351,16 +1351,14 @@ def replace(
flags : int, default 0 (no flags)
Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled
regex.
regex : bool, default True
regex : bool, default False
Determines if the passed-in pattern is a regular expression:
- If True, assumes the passed-in pattern is a regular expression.
- If False, treats the pattern as a literal string.
- Cannot be set to False if `pat` is a compiled regex or `repl` is
a callable.
.. versionadded:: 0.23.0
Returns
-------
Series or Index of object
Expand Down Expand Up @@ -1444,20 +1442,6 @@ def replace(
2 NaN
dtype: object
"""
if regex is None:
if isinstance(pat, str) and any(c in pat for c in ".+*|^$?[](){}\\"):
# warn only in cases where regex behavior would differ from literal
msg = (
"The default value of regex will change from True to False "
"in a future version."
)
if len(pat) == 1:
msg += (
" In addition, single character regular expressions will "
"*not* be treated as literal strings when regex=True."
)
warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())

# Check whether repl is valid (GH 13438, GH 15055)
if not (isinstance(repl, str) or callable(repl)):
raise TypeError("repl must be a string or callable")
Expand All @@ -1476,14 +1460,6 @@ def replace(
elif callable(repl):
raise ValueError("Cannot use a callable replacement when regex=False")

# The current behavior is to treat single character patterns as literal strings,
# even when ``regex`` is set to ``True``.
if isinstance(pat, str) and len(pat) == 1:
regex = False

if regex is None:
regex = True

if case is None:
case = True

Expand Down
53 changes: 15 additions & 38 deletions pandas/tests/strings/test_find_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,7 +423,7 @@ def test_replace_callable_raises(any_string_dtype, repl):
with tm.maybe_produces_warning(
PerformanceWarning, any_string_dtype == "string[pyarrow]"
):
values.str.replace("a", repl)
values.str.replace("a", repl, regex=True)


def test_replace_callable_named_groups(any_string_dtype):
Expand Down Expand Up @@ -477,7 +477,7 @@ def test_replace_compiled_regex_unicode(any_string_dtype):
with tm.maybe_produces_warning(
PerformanceWarning, any_string_dtype == "string[pyarrow]"
):
result = ser.str.replace(pat, ", ")
result = ser.str.replace(pat, ", ", regex=True)
tm.assert_series_equal(result, expected)


Expand All @@ -490,13 +490,13 @@ def test_replace_compiled_regex_raises(any_string_dtype):
msg = "case and flags cannot be set when pat is a compiled regex"

with pytest.raises(ValueError, match=msg):
ser.str.replace(pat, "", flags=re.IGNORECASE)
ser.str.replace(pat, "", flags=re.IGNORECASE, regex=True)

with pytest.raises(ValueError, match=msg):
ser.str.replace(pat, "", case=False)
ser.str.replace(pat, "", case=False, regex=True)

with pytest.raises(ValueError, match=msg):
ser.str.replace(pat, "", case=True)
ser.str.replace(pat, "", case=True, regex=True)


def test_replace_compiled_regex_callable(any_string_dtype):
Expand All @@ -507,7 +507,7 @@ def test_replace_compiled_regex_callable(any_string_dtype):
with tm.maybe_produces_warning(
PerformanceWarning, any_string_dtype == "string[pyarrow]"
):
result = ser.str.replace(pat, repl, n=2)
result = ser.str.replace(pat, repl, n=2, regex=True)
expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)

Expand Down Expand Up @@ -617,48 +617,25 @@ def test_replace_not_case_sensitive_not_regex(any_string_dtype):
tm.assert_series_equal(result, expected)


def test_replace_regex_default_warning(any_string_dtype):
def test_replace_regex(any_string_dtype):
# https://github.com/pandas-dev/pandas/pull/24809
s = Series(["a", "b", "ac", np.nan, ""], dtype=any_string_dtype)
msg = (
"The default value of regex will change from True to False in a "
"future version\\.$"
)

with tm.assert_produces_warning(
FutureWarning,
match=msg,
raise_on_extra_warnings=any_string_dtype != "string[pyarrow]",
):
result = s.str.replace("^.$", "a")
result = s.str.replace("^.$", "a", regex=True)
expected = Series(["a", "a", "ac", np.nan, ""], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("regex", [True, False, None])
@pytest.mark.parametrize("regex", [True, False])
def test_replace_regex_single_character(regex, any_string_dtype):
# https://github.com/pandas-dev/pandas/pull/24809

# The current behavior is to treat single character patterns as literal strings,
# even when ``regex`` is set to ``True``.

# https://github.com/pandas-dev/pandas/pull/24809, enforced in 2.0
# GH 24804
s = Series(["a.b", ".", "b", np.nan, ""], dtype=any_string_dtype)

if regex is None:
msg = re.escape(
"The default value of regex will change from True to False in a future "
"version. In addition, single character regular expressions will *not* "
"be treated as literal strings when regex=True."
)
with tm.assert_produces_warning(
FutureWarning,
match=msg,
):
result = s.str.replace(".", "a", regex=regex)
result = s.str.replace(".", "a", regex=regex)
if regex:
expected = Series(["aaa", "a", "a", np.nan, ""], dtype=any_string_dtype)
else:
result = s.str.replace(".", "a", regex=regex)

expected = Series(["aab", "a", "b", np.nan, ""], dtype=any_string_dtype)
expected = Series(["aab", "a", "b", np.nan, ""], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)


Expand Down

0 comments on commit 59f58e4

Please sign in to comment.