Skip to content

Commit

Permalink
fix(rust, python): str.contains strict=False took no effect (#6950)
Browse files: browse the repository at this point in the history
  • Loading branch information
sorhawell committed Feb 17, 2023
1 parent c67e966 commit a7c6933
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ pub(super) fn contains(s: &[Series], literal: bool, strict: bool) -> PolarsResul
if literal {
ca.contains_literal(pat)?
} else {
ca.contains(pat)?
ca.contains(pat, strict)?
}
}
None => BooleanChunked::full(ca.name(), false, ca.len()),
Expand Down Expand Up @@ -150,10 +150,7 @@ pub(super) fn contains(s: &[Series], literal: bool, strict: bool) -> PolarsResul
ca.into_iter()
.zip(pat.into_iter())
.map(|(opt_src, opt_val)| match (opt_src, opt_val) {
(Some(src), Some(pat)) => {
let re = Regex::new(pat).ok()?;
Some(re.is_match(src))
}
(Some(src), Some(pat)) => Regex::new(pat).ok().map(|re| re.is_match(src)),
_ => Some(false),
})
.collect_trusted()
Expand Down
24 changes: 16 additions & 8 deletions polars/polars-ops/src/chunked_array/strings/namespace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,14 +160,22 @@ pub trait Utf8NameSpaceImpl: AsUtf8 {
}

/// Check if strings contain a regex pattern.
fn contains(&self, pat: &str) -> PolarsResult<BooleanChunked> {
fn contains(&self, pat: &str, strict: bool) -> PolarsResult<BooleanChunked> {
let ca = self.as_utf8();
let reg = Regex::new(pat)?;
let f = |s: &str| reg.is_match(s);
let mut out: BooleanChunked = if !ca.has_validity() {
ca.into_no_null_iter().map(f).collect()
} else {
ca.into_iter().map(|opt_s| opt_s.map(f)).collect()

let res_reg = Regex::new(pat);
let opt_reg = if strict { Some(res_reg?) } else { res_reg.ok() };

let mut out: BooleanChunked = match (opt_reg, ca.has_validity()) {
(Some(reg), false) => ca
.into_no_null_iter()
.map(|s: &str| reg.is_match(s))
.collect(),
(Some(reg), true) => ca
.into_iter()
.map(|opt_s| opt_s.map(|s: &str| reg.is_match(s)))
.collect(),
(None, _) => ca.into_iter().map(|_| None).collect(),
};
out.rename(ca.name());
Ok(out)
Expand All @@ -178,7 +186,7 @@ pub trait Utf8NameSpaceImpl: AsUtf8 {
// note: benchmarking shows that the regex engine is actually
// faster at finding literal matches than str::contains.
// ref: https://github.com/pola-rs/polars/pull/6811
self.contains(escape(lit).as_str())
self.contains(escape(lit).as_str(), true)
}

/// Check if strings ends with a substring
Expand Down
2 changes: 1 addition & 1 deletion polars/tests/it/lazy/expressions/arity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ fn includes_null_predicate_3038() -> PolarsResult<()> {
move |s| {
s.utf8()?
.to_lowercase()
.contains("not_exist")
.contains("not_exist", true)
.map(|ca| Some(ca.into_series()))
},
GetOutput::from_type(DataType::Boolean),
Expand Down
13 changes: 13 additions & 0 deletions py-polars/tests/unit/namespaces/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,19 @@ def test_auto_explode() -> None:


def test_contains() -> None:
# strict vs. non-strict: an invalid regex yields nulls when strict=False and raises when strict=True
s_txt = pl.Series(["123", "456", "789"])
assert (
pl.Series([None, None, None]).cast(pl.Boolean).to_list()
== s_txt.str.contains("(not_valid_regex", literal=False, strict=False).to_list()
)
with pytest.raises(pl.ComputeError):
s_txt.str.contains("(not_valid_regex", literal=False, strict=True)
assert (
pl.Series([True, False, False]).cast(pl.Boolean).to_list()
== s_txt.str.contains("1", literal=False, strict=False).to_list()
)

df = pl.DataFrame(
data=[(1, "some * * text"), (2, "(with) special\n * chars"), (3, "**etc...?$")],
schema=["idx", "text"],
Expand Down

0 comments on commit a7c6933

Please sign in to comment.