Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

depr(python): Deprecate LazyFrame.with_context in favor of horizontal concatenation #16860

Merged
merged 1 commit into from
Jun 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions py-polars/polars/lazyframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4221,10 +4221,16 @@ def with_columns_seq(
)
return self._from_pyldf(self._ldf.with_columns_seq(pyexprs))

@deprecate_function(
"Use `pl.concat(..., how='horizontal')` instead.", version="1.0.0"
)
def with_context(self, other: Self | list[Self]) -> Self:
"""
Add an external context to the computation graph.

.. deprecated:: 1.0.0
Use :func:`concat` instead with `how='horizontal'`

This allows expressions to also access columns from DataFrames
that are not part of this one.

Expand All @@ -4237,7 +4243,7 @@ def with_context(self, other: Self | list[Self]) -> Self:
--------
>>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]})
>>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]})
>>> lf.with_context(lf_other).select(
>>> lf.with_context(lf_other).select( # doctest: +SKIP
... pl.col("b") + pl.col("c").first()
... ).collect()
shape: (3, 1)
Expand All @@ -4259,7 +4265,7 @@ def with_context(self, other: Self | list[Self]) -> Self:
>>> test_lf = pl.LazyFrame(
... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]}
... )
>>> test_lf.with_context(
>>> test_lf.with_context( # doctest: +SKIP
... train_lf.select(pl.all().name.suffix("_train"))
... ).select(
... pl.col("feature_0").fill_null(pl.col("feature_0_train").median())
Expand Down
112 changes: 109 additions & 3 deletions py-polars/tests/unit/lazyframe/test_with_context.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,121 @@
from datetime import datetime

import pytest

import polars as pl
from polars.testing import assert_frame_equal


def test_with_context() -> None:
    """Check that expressions can use a context column, but selecting it directly fails."""
    main_lf = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "c", None]}).lazy()
    extra_df = pl.DataFrame({"c": ["foo", "ham"]})
    concat_expr = pl.col("b") + pl.col("c").first()

    # `with_context` is deprecated, so every call is wrapped to assert the warning.
    with pytest.deprecated_call():
        result = main_lf.with_context(extra_df.lazy()).select(concat_expr)
    collected = result.collect().to_dict(as_series=False)
    assert collected == {"b": ["afoo", "cfoo", None]}

    with pytest.deprecated_call():
        context = main_lf.with_context(extra_df.lazy())
    # Selecting a frame column together with a bare context column must raise.
    with pytest.raises(pl.ComputeError):
        context.select("a", "c").collect()


# https://github.com/pola-rs/polars/issues/5867
def test_with_context_ignore_5867() -> None:
    """Check that attaching an unused context leaves a group-by aggregation unchanged."""
    outer = pl.LazyFrame({"OtherCol": [1, 2, 3, 4]})

    # Build the frame once, inside the guard: `with_context` is deprecated, and
    # calling it outside `deprecated_call` would emit an unasserted warning.
    with pytest.deprecated_call():
        lf = pl.LazyFrame(
            {"Category": [1, 1, 2, 2], "Counts": [1, 2, 3, 4]}
        ).with_context(outer)

    result = lf.group_by("Category", maintain_order=True).agg(pl.col("Counts").sum())

    expected = pl.LazyFrame({"Category": [1, 2], "Counts": [3, 7]})
    assert_frame_equal(result, expected)


def test_predicate_pushdown_with_context_11014() -> None:
    """Check a filter on a context column with predicate pushdown enabled."""
    left = pl.LazyFrame({"df1_c1": [1, 2, 3], "df1_c2": [2, 3, 4]})
    right = pl.LazyFrame({"df2_c1": [2, 3, 4], "df2_c2": [3, 4, 5]})
    in_context = pl.col("df1_c1").is_in(pl.col("df2_c1"))

    # `with_context` is deprecated; the whole query runs under the warning guard.
    with pytest.deprecated_call():
        query = left.with_context(right).filter(in_context)
        out = query.collect(predicate_pushdown=True)

    expected = {"df1_c1": [2, 3], "df1_c2": [3, 4]}
    assert out.to_dict(as_series=False) == expected


@pytest.mark.xdist_group("streaming")
def test_streaming_11219() -> None:
    """Check a streaming collect when a list of context frames is attached."""
    # https://github.com/pola-rs/polars/issues/11219

    base = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]})
    first_ctx = pl.LazyFrame({"c": ["foo", "ham"]})
    second_ctx = pl.LazyFrame({"c": ["foo", "ham"]})

    # Only the deprecated call itself needs to sit inside the warning guard.
    with pytest.deprecated_call():
        context = base.with_context([first_ctx, second_ctx])

    query = context.select(pl.col("b") + pl.col("c").first())
    collected = query.collect(streaming=True)
    assert collected.to_dict(as_series=False) == {"b": ["afoo", "cfoo", None]}


def test_no_cse_in_with_context() -> None:
    """Check gathering context columns by a `search_sorted` index over a frame column."""
    timestamps = [
        datetime(2023, 1, 1, 0, 0),
        datetime(2023, 5, 1, 0, 0),
        datetime(2023, 10, 1, 0, 0),
    ]
    range_starts = [
        datetime(2022, 12, 31, 0, 0),
        datetime(2023, 1, 2, 0, 0),
    ]
    range_ends = [
        datetime(2023, 4, 30, 0, 0),
        datetime(2023, 5, 5, 0, 0),
    ]
    df1 = pl.DataFrame({"timestamp": timestamps, "value": [2, 5, 9]})
    df2 = pl.DataFrame(
        {"date_start": range_starts, "date_end": range_ends, "label": [0, 1]}
    )

    # `with_context` is deprecated; assert the warning where the call happens.
    with pytest.deprecated_call():
        context = df1.lazy().with_context(df2.lazy())

    # Index of the last `date_start` positioned before each timestamp.
    row_idx = pl.col("date_start").search_sorted(pl.col("timestamp")) - 1
    gathered = context.select(pl.col("date_start", "label").gather(row_idx))

    expected = {
        "date_start": [
            datetime(2022, 12, 31, 0, 0),
            datetime(2023, 1, 2, 0, 0),
            datetime(2023, 1, 2, 0, 0),
        ],
        "label": [0, 1, 1],
    }
    assert gathered.collect().to_dict(as_series=False) == expected
10 changes: 0 additions & 10 deletions py-polars/tests/unit/streaming/test_streaming.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,16 +284,6 @@ def test_boolean_agg_schema() -> None:
)


def test_streaming_11219() -> None:
    """Check a streaming collect when a list of context frames is attached."""
    lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]})
    lf_other = pl.LazyFrame({"c": ["foo", "ham"]})
    lf_other2 = pl.LazyFrame({"c": ["foo", "ham"]})

    # `with_context` is deprecated: assert the warning rather than letting it
    # escape the test unchecked.
    with pytest.deprecated_call():
        context = lf.with_context([lf_other, lf_other2])

    result = context.select(pl.col("b") + pl.col("c").first()).collect(streaming=True)
    assert result.to_dict(as_series=False) == {"b": ["afoo", "cfoo", None]}


@pytest.mark.write_disk()
def test_streaming_csv_headers_but_no_data_13770(tmp_path: Path) -> None:
with Path.open(tmp_path / "header_no_data.csv", "w") as f:
Expand Down
43 changes: 0 additions & 43 deletions py-polars/tests/unit/test_cse.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,49 +489,6 @@ def test_cse_count_in_group_by() -> None:
}


def test_no_cse_in_with_context() -> None:
    """Check gathering context columns by a `search_sorted` index over a frame column."""
    df1 = pl.DataFrame(
        {
            "timestamp": [
                datetime(2023, 1, 1, 0, 0),
                datetime(2023, 5, 1, 0, 0),
                datetime(2023, 10, 1, 0, 0),
            ],
            "value": [2, 5, 9],
        }
    )
    df2 = pl.DataFrame(
        {
            "date_start": [
                datetime(2022, 12, 31, 0, 0),
                datetime(2023, 1, 2, 0, 0),
            ],
            "date_end": [
                datetime(2023, 4, 30, 0, 0),
                datetime(2023, 5, 5, 0, 0),
            ],
            "label": [0, 1],
        }
    )

    # `with_context` is deprecated: assert the warning rather than letting it
    # escape the test unchecked.
    with pytest.deprecated_call():
        context = df1.lazy().with_context(df2.lazy())

    assert (
        context.select(
            pl.col("date_start", "label").gather(
                pl.col("date_start").search_sorted(pl.col("timestamp")) - 1
            ),
        )
    ).collect().to_dict(as_series=False) == {
        "date_start": [
            datetime(2022, 12, 31, 0, 0),
            datetime(2023, 1, 2, 0, 0),
            datetime(2023, 1, 2, 0, 0),
        ],
        "label": [0, 1, 1],
    }


def test_cse_slice_11594() -> None:
df = pl.LazyFrame({"a": [1, 2, 1, 2, 1, 2]})

Expand Down
24 changes: 0 additions & 24 deletions py-polars/tests/unit/test_predicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,30 +132,6 @@ def test_predicate_pushdown_block_8661() -> None:
}


def test_predicate_pushdown_with_context_11014() -> None:
    """Check a filter on a context column with predicate pushdown enabled."""
    df1 = pl.LazyFrame(
        {
            "df1_c1": [1, 2, 3],
            "df1_c2": [2, 3, 4],
        }
    )

    df2 = pl.LazyFrame(
        {
            "df2_c1": [2, 3, 4],
            "df2_c2": [3, 4, 5],
        }
    )

    # `with_context` is deprecated: assert the warning rather than letting it
    # escape the test unchecked.
    with pytest.deprecated_call():
        out = (
            df1.with_context(df2)
            .filter(pl.col("df1_c1").is_in(pl.col("df2_c1")))
            .collect(predicate_pushdown=True)
        )

    assert out.to_dict(as_series=False) == {"df1_c1": [2, 3], "df1_c2": [3, 4]}


def test_predicate_pushdown_cumsum_9566() -> None:
df = pl.DataFrame({"A": range(10), "B": ["b"] * 5 + ["a"] * 5})

Expand Down
12 changes: 0 additions & 12 deletions py-polars/tests/unit/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,18 +156,6 @@ def test_bool_numeric_supertype() -> None:
)


def test_with_context() -> None:
    """Check that expressions can use a context column, but selecting it directly fails."""
    df_a = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "c", None]}).lazy()
    df_b = pl.DataFrame({"c": ["foo", "ham"]})

    # `with_context` is deprecated: each call is wrapped so the warning is
    # asserted rather than escaping the test unchecked.
    with pytest.deprecated_call():
        result = df_a.with_context(df_b.lazy()).select(
            [pl.col("b") + pl.col("c").first()]
        )
    assert result.collect().to_dict(as_series=False) == {"b": ["afoo", "cfoo", None]}

    with pytest.deprecated_call():
        context = df_a.with_context(df_b.lazy())
    # Selecting a frame column together with a bare context column must raise.
    with pytest.raises(pl.ComputeError):
        context.select(["a", "c"]).collect()


def test_from_dicts_nested_nulls() -> None:
assert pl.from_dicts([{"a": [None, None]}, {"a": [1, 2]}]).to_dict(
as_series=False
Expand Down