Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python): Allow mapping as syntactic sugar in str.replace_many #18214

Merged
merged 10 commits into from
Aug 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 2 additions & 5 deletions py-polars/polars/api.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from __future__ import annotations

from functools import reduce
from operator import or_
from typing import TYPE_CHECKING, Callable, Generic, TypeVar
from warnings import warn

Expand All @@ -20,9 +18,8 @@
]

# do not allow override of polars' own namespaces (as registered by '_accessors')
_reserved_namespaces: set[str] = reduce(
or_,
(cls._accessors for cls in (pl.DataFrame, pl.Expr, pl.LazyFrame, pl.Series)),
_reserved_namespaces: set[str] = set.union(
*(cls._accessors for cls in (pl.DataFrame, pl.Expr, pl.LazyFrame, pl.Series))
)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Drive by: intent is clearer IMO and removes the need for two imports



Expand Down
4 changes: 0 additions & 4 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6823,10 +6823,6 @@ def join(
Note that joining on any other expressions than `col`
will turn off coalescing.

Returns
-------
DataFrame

Comment on lines -6826 to -6829
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Drive by: the Returns section is not consistent with other DataFrame methods or with the LazyFrame.join docstring. It is also already covered by the type hint.

See Also
--------
join_asof
Expand Down
121 changes: 100 additions & 21 deletions py-polars/polars/expr/string.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import warnings
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Mapping

import polars._reexport as pl
from polars import functions as F
Expand All @@ -11,7 +11,7 @@
)
from polars._utils.parse import parse_into_expression
from polars._utils.unstable import unstable
from polars._utils.various import find_stacklevel
from polars._utils.various import find_stacklevel, no_default
from polars._utils.wrap import wrap_expr
from polars.datatypes import Date, Datetime, Time, parse_into_dtype
from polars.datatypes.constants import N_INFER_DEFAULT
Expand All @@ -28,6 +28,7 @@
TimeUnit,
TransferEncoding,
)
from polars._utils.various import NoDefault


class ExprStringNameSpace:
Expand Down Expand Up @@ -2400,9 +2401,9 @@ def contains_any(
self, patterns: IntoExpr, *, ascii_case_insensitive: bool = False
) -> Expr:
"""
Use the aho-corasick algorithm to find matches.
Use the Aho-Corasick algorithm to find matches.

This version determines if any of the patterns find a match.
Determines if any of the patterns are contained in the string.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is not clear what is being compared against "this version". Also updated to be a bit more explicit.


Parameters
----------
Expand All @@ -2413,6 +2414,11 @@ def contains_any(
When this option is enabled, searching will be performed without respect
to case for ASCII letters (a-z and A-Z) only.

Notes
-----
This method supports matching on string literals only, and does not support
regular expression matching.

Examples
--------
>>> _ = pl.Config.set_fmt_str_lengths(100)
Expand Down Expand Up @@ -2448,29 +2454,75 @@ def contains_any(

def replace_many(
self,
patterns: IntoExpr,
replace_with: IntoExpr,
patterns: IntoExpr | Mapping[str, str],
replace_with: IntoExpr | NoDefault = no_default,
*,
ascii_case_insensitive: bool = False,
) -> Expr:
"""

Use the aho-corasick algorithm to replace many matches.
Use the Aho-Corasick algorithm to replace many matches.

Parameters
----------
patterns
String patterns to search and replace.
Accepts expression input. Strings are parsed as column names, and other
non-expression inputs are parsed as literals. Also accepts a mapping of
patterns to their replacement as syntactic sugar for
`replace_many(pl.Series(mapping.keys()), pl.Series(mapping.values()))`.
replace_with
Strings to replace where a pattern was a match.
This can be broadcast, so it supports many:one and many:many.
Accepts expression input. Non-expression inputs are parsed as literals.
Length must match the length of `patterns` or have length 1. This can be
broadcast, so it supports many:one and many:many.
ascii_case_insensitive
Enable ASCII-aware case-insensitive matching.
When this option is enabled, searching will be performed without respect
to case for ASCII letters (a-z and A-Z) only.

Notes
-----
This method supports matching on string literals only, and does not support
regular expression matching.

Examples
--------
Replace many patterns by passing sequences of equal length to the `patterns` and
`replace_with` parameters.

>>> _ = pl.Config.set_fmt_str_lengths(100)
>>> _ = pl.Config.set_tbl_width_chars(110)
>>> df = pl.DataFrame(
... {
... "lyrics": [
... "Everybody wants to rule the world",
... "Tell me what you want, what you really really want",
... "Can you feel the love tonight",
... ]
... }
... )
>>> df.with_columns(
... pl.col("lyrics")
... .str.replace_many(
... ["me", "you"],
... ["you", "me"],
... )
... .alias("confusing")
... )
shape: (3, 2)
┌────────────────────────────────────────────────────┬───────────────────────────────────────────────────┐
│ lyrics ┆ confusing │
│ --- ┆ --- │
│ str ┆ str │
╞════════════════════════════════════════════════════╪═══════════════════════════════════════════════════╡
│ Everybody wants to rule the world ┆ Everybody wants to rule the world │
│ Tell me what you want, what you really really want ┆ Tell you what me want, what me really really want │
│ Can you feel the love tonight ┆ Can me feel the love tonight │
└────────────────────────────────────────────────────┴───────────────────────────────────────────────────┘

Broadcast a replacement for many patterns by passing a string or a sequence of
length 1 to the `replace_with` parameter.

>>> _ = pl.Config.set_fmt_str_lengths(100)
>>> df = pl.DataFrame(
... {
Expand Down Expand Up @@ -2499,27 +2551,50 @@ def replace_many(
│ Tell me what you want, what you really really want ┆ Tell what want, what really really want │
│ Can you feel the love tonight ┆ Can feel the love tonight │
└────────────────────────────────────────────────────┴────────────────────────────────────────────┘

Passing a mapping with patterns and replacements is also supported as syntactic
sugar.

>>> _ = pl.Config.set_fmt_str_lengths(100)
>>> _ = pl.Config.set_tbl_width_chars(110)
>>> df = pl.DataFrame(
... {
... "lyrics": [
... "Everybody wants to rule the world",
... "Tell me what you want, what you really really want",
... "Can you feel the love tonight",
... ]
... }
... )
>>> mapping = {"me": "you", "you": "me", "want": "need"}
>>> df.with_columns(
... pl.col("lyrics")
... .str.replace_many(
... ["me", "you"],
... ["you", "me"],
... )
... .alias("confusing")
... ) # doctest: +IGNORE_RESULT
... pl.col("lyrics").str.replace_many(mapping).alias("confusing")
... )
shape: (3, 2)
┌────────────────────────────────────────────────────┬───────────────────────────────────────────────────┐
│ lyrics ┆ confusing │
│ --- ┆ --- │
│ str ┆ str │
╞════════════════════════════════════════════════════╪═══════════════════════════════════════════════════╡
│ Everybody wants to rule the world ┆ Everybody wants to rule the world │
│ Tell me what you want, what you really really want ┆ Tell you what me want, what me really really want │
│ Everybody wants to rule the world ┆ Everybody needs to rule the world │
│ Tell me what you want, what you really really want ┆ Tell you what me need, what me really really need │
│ Can you feel the love tonight ┆ Can me feel the love tonight │
└────────────────────────────────────────────────────┴───────────────────────────────────────────────────┘
""" # noqa: W505
if replace_with is no_default:
if not isinstance(patterns, Mapping):
msg = "`replace_with` argument is required if `patterns` argument is not a Mapping type"
raise TypeError(msg)
# Early return in case of an empty mapping.
if not patterns:
return wrap_expr(self._pyexpr)
replace_with = pl.Series(patterns.values())
patterns = pl.Series(patterns.keys())

patterns = parse_into_expression(
patterns, str_as_lit=False, list_as_series=True
patterns, # type: ignore[arg-type]
str_as_lit=False,
list_as_series=True,
)
replace_with = parse_into_expression(
replace_with, str_as_lit=True, list_as_series=True
Expand All @@ -2539,8 +2614,7 @@ def extract_many(
overlapping: bool = False,
) -> Expr:
"""

Use the aho-corasick algorithm to extract many matches.
Use the Aho-Corasick algorithm to extract many matches.

Parameters
----------
Expand All @@ -2553,6 +2627,11 @@ def extract_many(
overlapping
Whether matches may overlap.

Notes
-----
This method supports matching on string literals only, and does not support
regular expression matching.

Examples
--------
>>> _ = pl.Config.set_fmt_str_lengths(100)
Expand Down
Loading