Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add cat.starts_with/cat.ends_with #20257

Merged
merged 8 commits into from
Jan 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions crates/polars-plan/src/dsl/cat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,20 @@ impl CategoricalNameSpace {
self.0
.map_private(FunctionExpr::Categorical(CategoricalFunction::LenChars))
}

/// Build the `cat.starts_with` expression for this categorical namespace.
///
/// The wrapped expression is mapped through `CategoricalFunction::StartsWith`, carrying
/// `prefix` as a literal argument. Only available with the `strings` feature.
#[cfg(feature = "strings")]
pub fn starts_with(self, prefix: String) -> Expr {
    let function = CategoricalFunction::StartsWith(prefix);
    self.0.map_private(FunctionExpr::Categorical(function))
}

/// Build the `cat.ends_with` expression for this categorical namespace.
///
/// The wrapped expression is mapped through `CategoricalFunction::EndsWith`, carrying
/// `suffix` as a literal argument. Only available with the `strings` feature.
#[cfg(feature = "strings")]
pub fn ends_with(self, suffix: String) -> Expr {
    let function = CategoricalFunction::EndsWith(suffix);
    self.0.map_private(FunctionExpr::Categorical(function))
}
}
56 changes: 50 additions & 6 deletions crates/polars-plan/src/dsl/function_expr/cat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ pub enum CategoricalFunction {
LenBytes,
#[cfg(feature = "strings")]
LenChars,
#[cfg(feature = "strings")]
StartsWith(String),
#[cfg(feature = "strings")]
EndsWith(String),
}

impl CategoricalFunction {
Expand All @@ -20,6 +24,10 @@ impl CategoricalFunction {
LenBytes => mapper.with_dtype(DataType::UInt32),
#[cfg(feature = "strings")]
LenChars => mapper.with_dtype(DataType::UInt32),
#[cfg(feature = "strings")]
StartsWith(_) => mapper.with_dtype(DataType::Boolean),
#[cfg(feature = "strings")]
EndsWith(_) => mapper.with_dtype(DataType::Boolean),
}
}
}
Expand All @@ -33,6 +41,10 @@ impl Display for CategoricalFunction {
LenBytes => "len_bytes",
#[cfg(feature = "strings")]
LenChars => "len_chars",
#[cfg(feature = "strings")]
StartsWith(_) => "starts_with",
#[cfg(feature = "strings")]
EndsWith(_) => "ends_with",
};
write!(f, "cat.{s}")
}
Expand All @@ -47,6 +59,10 @@ impl From<CategoricalFunction> for SpecialEq<Arc<dyn ColumnsUdf>> {
LenBytes => map!(len_bytes),
#[cfg(feature = "strings")]
LenChars => map!(len_chars),
#[cfg(feature = "strings")]
StartsWith(prefix) => map!(starts_with, prefix.as_str()),
#[cfg(feature = "strings")]
EndsWith(suffix) => map!(ends_with, suffix.as_str()),
}
}
}
Expand Down Expand Up @@ -83,28 +99,56 @@ fn _get_cat_phys_map(ca: &CategoricalChunked) -> (StringChunked, Series) {
(categories, phys)
}

/// Apply a function to the categories of a categorical column and broadcast the result back to the
/// array.
fn apply_to_cats<F, T>(s: &Column, mut op: F) -> PolarsResult<Column>
/// Fast path: apply a string function to the categories of a categorical column and broadcast the
/// result back to the array.
fn apply_to_cats<F, T>(ca: &CategoricalChunked, mut op: F) -> PolarsResult<Column>
where
F: FnMut(&StringChunked) -> ChunkedArray<T>,
ChunkedArray<T>: IntoSeries,
T: PolarsDataType<HasViews = FalseT, IsStruct = FalseT, IsNested = FalseT>,
{
let ca = s.categorical()?;
let (categories, phys) = _get_cat_phys_map(ca);
let result = op(&categories);
// SAFETY: physical idx array is valid.
let out = unsafe { result.take_unchecked(phys.idx().unwrap()) };
Ok(out.into_column())
}

/// Fast path for binary kernels on categorical data.
///
/// `op` is invoked once on the binary view of the categories, and the per-category result is
/// then gathered through the physical index array so that the output lines up with the rows
/// of `ca`.
fn apply_to_cats_binary<F, T>(ca: &CategoricalChunked, mut op: F) -> PolarsResult<Column>
where
    F: FnMut(&BinaryChunked) -> ChunkedArray<T>,
    ChunkedArray<T>: IntoSeries,
    T: PolarsDataType<HasViews = FalseT, IsStruct = FalseT, IsNested = FalseT>,
{
    let (categories, phys) = _get_cat_phys_map(ca);
    let binary_categories = categories.as_binary();
    let per_category = op(&binary_categories);
    let row_to_category = phys.idx().unwrap();
    // SAFETY: physical idx array is valid.
    let broadcast = unsafe { per_category.take_unchecked(row_to_category) };
    Ok(broadcast.into_column())
}

#[cfg(feature = "strings")]
fn len_bytes(s: &Column) -> PolarsResult<Column> {
apply_to_cats(s, |s| s.str_len_bytes())
let ca = s.categorical()?;
apply_to_cats(ca, |s| s.str_len_bytes())
}

#[cfg(feature = "strings")]
fn len_chars(s: &Column) -> PolarsResult<Column> {
apply_to_cats(s, |s| s.str_len_chars())
let ca = s.categorical()?;
apply_to_cats(ca, |s| s.str_len_chars())
}

/// Evaluate `cat.starts_with` on a column.
///
/// Fails if `s` is not a categorical column; otherwise the prefix test is run on the
/// categories via `apply_to_cats`.
#[cfg(feature = "strings")]
fn starts_with(s: &Column, prefix: &str) -> PolarsResult<Column> {
    s.categorical()
        .and_then(|cats| apply_to_cats(cats, |categories| categories.starts_with(prefix)))
}

/// Evaluate `cat.ends_with` on a column.
///
/// Fails if `s` is not a categorical column; otherwise the suffix test is run on the
/// binary view of the categories via `apply_to_cats_binary`.
#[cfg(feature = "strings")]
fn ends_with(s: &Column, suffix: &str) -> PolarsResult<Column> {
    let ca = s.categorical()?;
    // Convert the needle once, outside the closure.
    let suffix = suffix.as_bytes();
    // `apply_to_cats_binary` already hands the closure a `&BinaryChunked`, so no further
    // `as_binary()` conversion is needed before calling `ends_with`.
    apply_to_cats_binary(ca, |categories| categories.ends_with(suffix))
}
8 changes: 8 additions & 0 deletions crates/polars-python/src/expr/categorical.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,12 @@ impl PyExpr {
fn cat_len_chars(&self) -> Self {
self.inner.clone().cat().len_chars().into()
}

/// Binding for `cat.starts_with`: clones the wrapped expression, applies the categorical
/// `starts_with` builder with `prefix`, and converts the result back into `Self`.
fn cat_starts_with(&self, prefix: String) -> Self {
    let expr = self.inner.clone().cat().starts_with(prefix);
    expr.into()
}

/// Binding for `cat.ends_with`: clones the wrapped expression, applies the categorical
/// `ends_with` builder with `suffix`, and converts the result back into `Self`.
fn cat_ends_with(&self, suffix: String) -> Self {
    let expr = self.inner.clone().cat().ends_with(suffix);
    expr.into()
}
}
2 changes: 2 additions & 0 deletions py-polars/docs/source/reference/expressions/categories.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ The following methods are available under the `expr.cat` attribute.
:toctree: api/
:template: autosummary/accessor_method.rst

Expr.cat.ends_with
Expr.cat.get_categories
Expr.cat.len_bytes
Expr.cat.len_chars
Expr.cat.starts_with
2 changes: 2 additions & 0 deletions py-polars/docs/source/reference/series/categories.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@ The following methods are available under the `Series.cat` attribute.
:toctree: api/
:template: autosummary/accessor_method.rst

Series.cat.ends_with
Series.cat.get_categories
Series.cat.is_local
Series.cat.len_bytes
Series.cat.len_chars
Series.cat.starts_with
Series.cat.to_local
Series.cat.uses_lexical_ordering
108 changes: 108 additions & 0 deletions py-polars/polars/expr/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,3 +129,111 @@ def len_chars(self) -> Expr:
└──────┴─────────┴─────────┘
"""
return wrap_expr(self._pyexpr.cat_len_chars())

def starts_with(self, prefix: str) -> Expr:
    """
    Check if the string representation of each value starts with a substring.

    Parameters
    ----------
    prefix
        Prefix substring.

    Raises
    ------
    TypeError
        If `prefix` is not a string.

    See Also
    --------
    ends_with : Check if string representations end with a substring.

    Notes
    -----
    Whereas `str.starts_with` allows expression inputs, `cat.starts_with` requires
    a literal string value.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {"fruits": pl.Series(["apple", "mango", None], dtype=pl.Categorical)}
    ... )
    >>> df.with_columns(
    ...     pl.col("fruits").cat.starts_with("app").alias("has_prefix"),
    ... )
    shape: (3, 2)
    ┌────────┬────────────┐
    │ fruits ┆ has_prefix │
    │ ---    ┆ ---        │
    │ cat    ┆ bool       │
    ╞════════╪════════════╡
    │ apple  ┆ true       │
    │ mango  ┆ false      │
    │ null   ┆ null       │
    └────────┴────────────┘

    Using `starts_with` as a filter condition:

    >>> df.filter(pl.col("fruits").cat.starts_with("app"))
    shape: (1, 1)
    ┌────────┐
    │ fruits │
    │ ---    │
    │ cat    │
    ╞════════╡
    │ apple  │
    └────────┘
    """
    if isinstance(prefix, str):
        return wrap_expr(self._pyexpr.cat_starts_with(prefix))
    # Only literal strings are accepted; reject everything else up front.
    msg = f"'prefix' must be a string; found {type(prefix)!r}"
    raise TypeError(msg)

def ends_with(self, suffix: str) -> Expr:
    """
    Check if the string representation of each value ends with a substring.

    Parameters
    ----------
    suffix
        Suffix substring.

    Raises
    ------
    TypeError
        If `suffix` is not a string.

    See Also
    --------
    starts_with : Check if string representations start with a substring.

    Notes
    -----
    Whereas `str.ends_with` allows expression inputs, `cat.ends_with` requires a
    literal string value.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {"fruits": pl.Series(["apple", "mango", None], dtype=pl.Categorical)}
    ... )
    >>> df.with_columns(pl.col("fruits").cat.ends_with("go").alias("has_suffix"))
    shape: (3, 2)
    ┌────────┬────────────┐
    │ fruits ┆ has_suffix │
    │ ---    ┆ ---        │
    │ cat    ┆ bool       │
    ╞════════╪════════════╡
    │ apple  ┆ false      │
    │ mango  ┆ true       │
    │ null   ┆ null       │
    └────────┴────────────┘

    Using `ends_with` as a filter condition:

    >>> df.filter(pl.col("fruits").cat.ends_with("go"))
    shape: (1, 1)
    ┌────────┐
    │ fruits │
    │ ---    │
    │ cat    │
    ╞════════╡
    │ mango  │
    └────────┘
    """
    if isinstance(suffix, str):
        return wrap_expr(self._pyexpr.cat_ends_with(suffix))
    # Only literal strings are accepted; reject everything else up front.
    msg = f"'suffix' must be a string; found {type(suffix)!r}"
    raise TypeError(msg)
54 changes: 54 additions & 0 deletions py-polars/polars/series/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,3 +185,57 @@ def len_chars(self) -> Series:
null
]
"""

def starts_with(self, prefix: str) -> Series:
    """
    Check if string representations of values start with a substring.

    Parameters
    ----------
    prefix
        Prefix substring.

    See Also
    --------
    ends_with : Check if string representations end with a substring.

    Examples
    --------
    >>> s = pl.Series("fruits", ["apple", "mango", None], dtype=pl.Categorical)
    >>> s.cat.starts_with("app")
    shape: (3,)
    Series: 'fruits' [bool]
    [
        true
        false
        null
    ]
    """

def ends_with(self, suffix: str) -> Series:
    """
    Check if string representations of values end with a substring.

    Parameters
    ----------
    suffix
        Suffix substring.

    See Also
    --------
    starts_with : Check if string representations start with a substring.

    Examples
    --------
    >>> s = pl.Series("fruits", ["apple", "mango", None], dtype=pl.Categorical)
    >>> s.cat.ends_with("go")
    shape: (3,)
    Series: 'fruits' [bool]
    [
        false
        true
        null
    ]
    """
2 changes: 1 addition & 1 deletion py-polars/polars/series/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,7 +528,7 @@ def find(
]
"""

def ends_with(self, suffix: str | Expr) -> Series:
def ends_with(self, suffix: str | Expr | None) -> Series:
"""
Check if string values end with a substring.

Expand Down
49 changes: 49 additions & 0 deletions py-polars/tests/unit/operations/namespaces/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,3 +247,52 @@ def test_cat_len_chars() -> None:
}
)
assert_frame_equal(result_df, expected_df)


@pytest.mark.usefixtures("test_global_and_local")
def test_starts_ends_with() -> None:
    """Cover `cat.starts_with` / `cat.ends_with` through both the Series and Expr APIs."""
    values = ["hamburger_with_tomatoes", "nuts", "nuts", "lollypop", None]

    # --- Series namespace -------------------------------------------------
    s = pl.Series("a", values, dtype=pl.Categorical)

    ends_pop = pl.Series("a", [False, False, False, True, None])
    assert_series_equal(s.cat.ends_with("pop"), ends_pop)

    starts_nu = pl.Series("a", [False, True, True, False, None])
    assert_series_equal(s.cat.starts_with("nu"), starts_nu)

    # Non-string arguments must be rejected with a descriptive TypeError.
    with pytest.raises(TypeError, match="'prefix' must be a string; found"):
        s.cat.starts_with(None)  # type: ignore[arg-type]

    with pytest.raises(TypeError, match="'suffix' must be a string; found"):
        s.cat.ends_with(None)  # type: ignore[arg-type]

    # --- Expression namespace ---------------------------------------------
    df = pl.DataFrame({"a": pl.Series(values, dtype=pl.Categorical)})

    result = df.select(
        pl.col("a").cat.ends_with("pop").alias("ends_pop"),
        pl.col("a").cat.starts_with("ham").alias("starts_ham"),
    )
    assert result.to_dict(as_series=False) == {
        "ends_pop": [False, False, False, True, None],
        "starts_ham": [True, False, False, False, None],
    }

    with pytest.raises(TypeError, match="'prefix' must be a string; found"):
        df.select(pl.col("a").cat.starts_with(None))  # type: ignore[arg-type]

    with pytest.raises(TypeError, match="'suffix' must be a string; found"):
        df.select(pl.col("a").cat.ends_with(None))  # type: ignore[arg-type]
Loading