Skip to content

Commit

Permalink
depr(python): Deprecate LazyFrame.with_context (#16860)
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego authored Jun 11, 2024
1 parent 0b0af39 commit 1a2707d
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 94 deletions.
10 changes: 8 additions & 2 deletions py-polars/polars/lazyframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4221,10 +4221,16 @@ def with_columns_seq(
)
return self._from_pyldf(self._ldf.with_columns_seq(pyexprs))

@deprecate_function(
"Use `pl.concat(..., how='horizontal')` instead.", version="1.0.0"
)
def with_context(self, other: Self | list[Self]) -> Self:
"""
Add an external context to the computation graph.
.. deprecated:: 1.0.0
Use :func:`concat` instead with `how='horizontal'`
This allows expressions to also access columns from DataFrames
that are not part of this one.
Expand All @@ -4237,7 +4243,7 @@ def with_context(self, other: Self | list[Self]) -> Self:
--------
>>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]})
>>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]})
>>> lf.with_context(lf_other).select(
>>> lf.with_context(lf_other).select( # doctest: +SKIP
... pl.col("b") + pl.col("c").first()
... ).collect()
shape: (3, 1)
Expand All @@ -4259,7 +4265,7 @@ def with_context(self, other: Self | list[Self]) -> Self:
>>> test_lf = pl.LazyFrame(
... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]}
... )
>>> test_lf.with_context(
>>> test_lf.with_context( # doctest: +SKIP
... train_lf.select(pl.all().name.suffix("_train"))
... ).select(
... pl.col("feature_0").fill_null(pl.col("feature_0_train").median())
Expand Down
112 changes: 109 additions & 3 deletions py-polars/tests/unit/lazyframe/test_with_context.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,121 @@
from datetime import datetime

import pytest

import polars as pl
from polars.testing import assert_frame_equal


def test_with_context() -> None:
    """Exercise the deprecated `with_context` on a pair of lazy frames.

    Asserts that an expression may reference a column of the context frame,
    and that selecting the context column "c" directly raises
    `pl.ComputeError` on collect.
    """
    left = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "c", None]}).lazy()
    right = pl.DataFrame({"c": ["foo", "ham"]})

    # Query construction happens under the deprecation check; collection
    # happens after it.
    with pytest.deprecated_call():
        query = left.with_context(right.lazy()).select(
            pl.col("b") + pl.col("c").first()
        )
    materialized = query.collect().to_dict(as_series=False)
    assert materialized == {"b": ["afoo", "cfoo", None]}

    with pytest.deprecated_call():
        with_ctx = left.with_context(right.lazy())
    with pytest.raises(pl.ComputeError):
        with_ctx.select("a", "c").collect()


# https://github.com/pola-rs/polars/issues/5867
def test_with_context_ignore_5867() -> None:
    """Aggregate a frame that carries a context frame the query never uses.

    `with_context` is called exactly once, inside `pytest.deprecated_call()`,
    so the deprecation warning it emits is captured rather than leaking out
    of the test. The group-by result is compared against the expected sums
    per category.
    """
    outer = pl.LazyFrame({"OtherCol": [1, 2, 3, 4]})

    # Build the frame only under the deprecation check; a second, unguarded
    # `with_context` call would raise the warning outside of it.
    with pytest.deprecated_call():
        lf = pl.LazyFrame(
            {"Category": [1, 1, 2, 2], "Counts": [1, 2, 3, 4]}
        ).with_context(outer)

    result = lf.group_by("Category", maintain_order=True).agg(pl.col("Counts").sum())

    expected = pl.LazyFrame({"Category": [1, 2], "Counts": [3, 7]})
    assert_frame_equal(result, expected)


def test_predicate_pushdown_with_context_11014() -> None:
    """Filter on membership in a context column with predicate pushdown on.

    Rows of the first frame are kept when `df1_c1` is found in the context
    frame's `df2_c1`; the query is collected with `predicate_pushdown=True`.
    """
    left = pl.LazyFrame({"df1_c1": [1, 2, 3], "df1_c2": [2, 3, 4]})
    right = pl.LazyFrame({"df2_c1": [2, 3, 4], "df2_c2": [3, 4, 5]})
    in_context = pl.col("df1_c1").is_in(pl.col("df2_c1"))

    # The whole query, including `collect`, runs under the deprecation check.
    with pytest.deprecated_call():
        filtered = left.with_context(right).filter(in_context)
        out = filtered.collect(predicate_pushdown=True)

    assert out.to_dict(as_series=False) == {"df1_c1": [2, 3], "df1_c2": [3, 4]}


@pytest.mark.xdist_group("streaming")
def test_streaming_11219() -> None:
    """Collect a query over a list of context frames with `streaming=True`."""
    # https://github.com/pola-rs/polars/issues/11219

    base = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]})
    extras = [
        pl.LazyFrame({"c": ["foo", "ham"]}),
        pl.LazyFrame({"c": ["foo", "ham"]}),
    ]

    # Only the `with_context` call is wrapped in the deprecation check.
    with pytest.deprecated_call():
        context = base.with_context(extras)

    query = context.select(pl.col("b") + pl.col("c").first())
    collected = query.collect(streaming=True)
    assert collected.to_dict(as_series=False) == {"b": ["afoo", "cfoo", None]}


def test_no_cse_in_with_context() -> None:
    """Gather context-frame columns at positions computed by `search_sorted`.

    The context frame's `date_start` and `label` columns are gathered at
    `search_sorted(timestamp) - 1`, and the collected result is compared with
    the expected rows.
    """
    events = pl.DataFrame(
        {
            "timestamp": [
                datetime(2023, 1, 1, 0, 0),
                datetime(2023, 5, 1, 0, 0),
                datetime(2023, 10, 1, 0, 0),
            ],
            "value": [2, 5, 9],
        }
    )
    periods = pl.DataFrame(
        {
            "date_start": [
                datetime(2022, 12, 31, 0, 0),
                datetime(2023, 1, 2, 0, 0),
            ],
            "date_end": [
                datetime(2023, 4, 30, 0, 0),
                datetime(2023, 5, 5, 0, 0),
            ],
            "label": [0, 1],
        }
    )

    with pytest.deprecated_call():
        context = events.lazy().with_context(periods.lazy())

    # Positions come from the context column itself, shifted back by one.
    positions = pl.col("date_start").search_sorted(pl.col("timestamp")) - 1
    gathered = context.select(
        pl.col("date_start", "label").gather(positions),
    ).collect()

    expected = {
        "date_start": [
            datetime(2022, 12, 31, 0, 0),
            datetime(2023, 1, 2, 0, 0),
            datetime(2023, 1, 2, 0, 0),
        ],
        "label": [0, 1, 1],
    }
    assert gathered.to_dict(as_series=False) == expected
10 changes: 0 additions & 10 deletions py-polars/tests/unit/streaming/test_streaming.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,16 +284,6 @@ def test_boolean_agg_schema() -> None:
)


# Listed as deleted from py-polars/tests/unit/streaming/test_streaming.py
# in this diff (0 additions & 10 deletions).
# This version calls `with_context` on a list of two frames with no
# `pytest.deprecated_call()` guard, then collects with `streaming=True`.
def test_streaming_11219() -> None:
lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]})
lf_other = pl.LazyFrame({"c": ["foo", "ham"]})
lf_other2 = pl.LazyFrame({"c": ["foo", "ham"]})

assert lf.with_context([lf_other, lf_other2]).select(
pl.col("b") + pl.col("c").first()
).collect(streaming=True).to_dict(as_series=False) == {"b": ["afoo", "cfoo", None]}


@pytest.mark.write_disk()
def test_streaming_csv_headers_but_no_data_13770(tmp_path: Path) -> None:
with Path.open(tmp_path / "header_no_data.csv", "w") as f:
Expand Down
43 changes: 0 additions & 43 deletions py-polars/tests/unit/test_cse.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,49 +489,6 @@ def test_cse_count_in_group_by() -> None:
}


# Listed as deleted from py-polars/tests/unit/test_cse.py in this diff
# (0 additions & 43 deletions).
# This version chains `with_context` directly inside the assertion, with no
# `pytest.deprecated_call()` guard around it.
def test_no_cse_in_with_context() -> None:
df1 = pl.DataFrame(
{
"timestamp": [
datetime(2023, 1, 1, 0, 0),
datetime(2023, 5, 1, 0, 0),
datetime(2023, 10, 1, 0, 0),
],
"value": [2, 5, 9],
}
)
df2 = pl.DataFrame(
{
"date_start": [
datetime(2022, 12, 31, 0, 0),
datetime(2023, 1, 2, 0, 0),
],
"date_end": [
datetime(2023, 4, 30, 0, 0),
datetime(2023, 5, 5, 0, 0),
],
"label": [0, 1],
}
)

# Gathers `date_start` and `label` from the context frame at
# `search_sorted(timestamp) - 1` and compares the collected rows.
assert (
df1.lazy()
.with_context(df2.lazy())
.select(
pl.col("date_start", "label").gather(
pl.col("date_start").search_sorted(pl.col("timestamp")) - 1
),
)
).collect().to_dict(as_series=False) == {
"date_start": [
datetime(2022, 12, 31, 0, 0),
datetime(2023, 1, 2, 0, 0),
datetime(2023, 1, 2, 0, 0),
],
"label": [0, 1, 1],
}


def test_cse_slice_11594() -> None:
df = pl.LazyFrame({"a": [1, 2, 1, 2, 1, 2]})

Expand Down
24 changes: 0 additions & 24 deletions py-polars/tests/unit/test_predicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,30 +132,6 @@ def test_predicate_pushdown_block_8661() -> None:
}


# Listed as deleted from py-polars/tests/unit/test_predicates.py in this diff
# (0 additions & 24 deletions).
# This version calls `with_context` with no `pytest.deprecated_call()` guard.
def test_predicate_pushdown_with_context_11014() -> None:
df1 = pl.LazyFrame(
{
"df1_c1": [1, 2, 3],
"df1_c2": [2, 3, 4],
}
)

df2 = pl.LazyFrame(
{
"df2_c1": [2, 3, 4],
"df2_c2": [3, 4, 5],
}
)

# Keeps rows of df1 whose `df1_c1` is found in the context column `df2_c1`,
# collecting with predicate pushdown explicitly enabled.
out = (
df1.with_context(df2)
.filter(pl.col("df1_c1").is_in(pl.col("df2_c1")))
.collect(predicate_pushdown=True)
)

assert out.to_dict(as_series=False) == {"df1_c1": [2, 3], "df1_c2": [3, 4]}


def test_predicate_pushdown_cumsum_9566() -> None:
df = pl.DataFrame({"A": range(10), "B": ["b"] * 5 + ["a"] * 5})

Expand Down
12 changes: 0 additions & 12 deletions py-polars/tests/unit/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,18 +156,6 @@ def test_bool_numeric_supertype() -> None:
)


# Listed as deleted from py-polars/tests/unit/test_schema.py in this diff
# (0 additions & 12 deletions).
# This version calls `with_context` twice with no `pytest.deprecated_call()`
# guard and passes the selected expressions/columns as lists.
def test_with_context() -> None:
df_a = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "c", None]}).lazy()
df_b = pl.DataFrame({"c": ["foo", "ham"]})

# An expression may reference column "c" of the context frame.
assert (
df_a.with_context(df_b.lazy()).select([pl.col("b") + pl.col("c").first()])
).collect().to_dict(as_series=False) == {"b": ["afoo", "cfoo", None]}

# Selecting the context column "c" directly is expected to fail on collect.
with pytest.raises(pl.ComputeError):
(df_a.with_context(df_b.lazy()).select(["a", "c"])).collect()


def test_from_dicts_nested_nulls() -> None:
assert pl.from_dicts([{"a": [None, None]}, {"a": [1, 2]}]).to_dict(
as_series=False
Expand Down

0 comments on commit 1a2707d

Please sign in to comment.