From e921030c0201d441453ea0fd7953386725a36cfa Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 10 Jun 2024 22:03:18 +0200 Subject: [PATCH] depr(python): Deprecate `LazyFrame.with_context` --- py-polars/polars/lazyframe/frame.py | 10 +- .../tests/unit/lazyframe/test_with_context.py | 112 +++++++++++++++++- .../tests/unit/streaming/test_streaming.py | 10 -- py-polars/tests/unit/test_cse.py | 43 ------- py-polars/tests/unit/test_predicates.py | 24 ---- py-polars/tests/unit/test_schema.py | 12 -- 6 files changed, 117 insertions(+), 94 deletions(-) diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 49be56ea4423..9818301c0ee0 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -4221,10 +4221,16 @@ def with_columns_seq( ) return self._from_pyldf(self._ldf.with_columns_seq(pyexprs)) + @deprecate_function( + "Use `pl.concat(..., how='horizontal')` instead.", version="1.0.0" + ) def with_context(self, other: Self | list[Self]) -> Self: """ Add an external context to the computation graph. + .. deprecated:: 1.0.0 + Use :func:`concat` with `how='horizontal'` instead. + This allows expressions to also access columns from DataFrames that are not part of this one. @@ -4237,7 +4243,7 @@ def with_context(self, other: Self | list[Self]) -> Self: -------- >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) - >>> lf.with_context(lf_other).select( + >>> lf.with_context(lf_other).select( # doctest: +SKIP ... pl.col("b") + pl.col("c").first() ... ).collect() shape: (3, 1) @@ -4259,7 +4265,7 @@ def with_context(self, other: Self | list[Self]) -> Self: >>> test_lf = pl.LazyFrame( ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} ... ) - >>> test_lf.with_context( + >>> test_lf.with_context( # doctest: +SKIP ... train_lf.select(pl.all().name.suffix("_train")) ... ).select( ... 
pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) diff --git a/py-polars/tests/unit/lazyframe/test_with_context.py b/py-polars/tests/unit/lazyframe/test_with_context.py index 6983b7a5060f..43eee761d28f 100644 --- a/py-polars/tests/unit/lazyframe/test_with_context.py +++ b/py-polars/tests/unit/lazyframe/test_with_context.py @@ -1,15 +1,121 @@ +from datetime import datetime + +import pytest + import polars as pl from polars.testing import assert_frame_equal +def test_with_context() -> None: + df_a = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "c", None]}).lazy() + df_b = pl.DataFrame({"c": ["foo", "ham"]}) + + with pytest.deprecated_call(): + result = df_a.with_context(df_b.lazy()).select( + pl.col("b") + pl.col("c").first() + ) + assert result.collect().to_dict(as_series=False) == {"b": ["afoo", "cfoo", None]} + + with pytest.deprecated_call(): + context = df_a.with_context(df_b.lazy()) + with pytest.raises(pl.ComputeError): + context.select("a", "c").collect() + + # https://github.com/pola-rs/polars/issues/5867 def test_with_context_ignore_5867() -> None: outer = pl.LazyFrame({"OtherCol": [1, 2, 3, 4]}) - lf = pl.LazyFrame({"Category": [1, 1, 2, 2], "Counts": [1, 2, 3, 4]}).with_context( - outer - ) + with pytest.deprecated_call(): + lf = pl.LazyFrame( + {"Category": [1, 1, 2, 2], "Counts": [1, 2, 3, 4]} + ).with_context(outer) result = lf.group_by("Category", maintain_order=True).agg(pl.col("Counts").sum()) expected = pl.LazyFrame({"Category": [1, 2], "Counts": [3, 7]}) assert_frame_equal(result, expected) + + +def test_predicate_pushdown_with_context_11014() -> None: + df1 = pl.LazyFrame( + { + "df1_c1": [1, 2, 3], + "df1_c2": [2, 3, 4], + } + ) + + df2 = pl.LazyFrame( + { + "df2_c1": [2, 3, 4], + "df2_c2": [3, 4, 5], + } + ) + + with pytest.deprecated_call(): + out = ( + df1.with_context(df2) + .filter(pl.col("df1_c1").is_in(pl.col("df2_c1"))) + .collect(predicate_pushdown=True) + ) + + assert out.to_dict(as_series=False) == {"df1_c1": [2, 3], 
"df1_c2": [3, 4]} + + +@pytest.mark.xdist_group("streaming") +def test_streaming_11219() -> None: + # https://github.com/pola-rs/polars/issues/11219 + + lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + lf_other2 = pl.LazyFrame({"c": ["foo", "ham"]}) + + with pytest.deprecated_call(): + context = lf.with_context([lf_other, lf_other2]) + + assert context.select(pl.col("b") + pl.col("c").first()).collect( + streaming=True + ).to_dict(as_series=False) == {"b": ["afoo", "cfoo", None]} + + +def test_no_cse_in_with_context() -> None: + df1 = pl.DataFrame( + { + "timestamp": [ + datetime(2023, 1, 1, 0, 0), + datetime(2023, 5, 1, 0, 0), + datetime(2023, 10, 1, 0, 0), + ], + "value": [2, 5, 9], + } + ) + df2 = pl.DataFrame( + { + "date_start": [ + datetime(2022, 12, 31, 0, 0), + datetime(2023, 1, 2, 0, 0), + ], + "date_end": [ + datetime(2023, 4, 30, 0, 0), + datetime(2023, 5, 5, 0, 0), + ], + "label": [0, 1], + } + ) + + with pytest.deprecated_call(): + context = df1.lazy().with_context(df2.lazy()) + + assert ( + context.select( + pl.col("date_start", "label").gather( + pl.col("date_start").search_sorted(pl.col("timestamp")) - 1 + ), + ) + ).collect().to_dict(as_series=False) == { + "date_start": [ + datetime(2022, 12, 31, 0, 0), + datetime(2023, 1, 2, 0, 0), + datetime(2023, 1, 2, 0, 0), + ], + "label": [0, 1, 1], + } diff --git a/py-polars/tests/unit/streaming/test_streaming.py b/py-polars/tests/unit/streaming/test_streaming.py index 46d968902a9c..b2fa4eacc9fe 100644 --- a/py-polars/tests/unit/streaming/test_streaming.py +++ b/py-polars/tests/unit/streaming/test_streaming.py @@ -284,16 +284,6 @@ def test_boolean_agg_schema() -> None: ) -def test_streaming_11219() -> None: - lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) - lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) - lf_other2 = pl.LazyFrame({"c": ["foo", "ham"]}) - - assert lf.with_context([lf_other, lf_other2]).select( - pl.col("b") + 
pl.col("c").first() - ).collect(streaming=True).to_dict(as_series=False) == {"b": ["afoo", "cfoo", None]} - - @pytest.mark.write_disk() def test_streaming_csv_headers_but_no_data_13770(tmp_path: Path) -> None: with Path.open(tmp_path / "header_no_data.csv", "w") as f: diff --git a/py-polars/tests/unit/test_cse.py b/py-polars/tests/unit/test_cse.py index 0081bc15ca43..ca413318e3b3 100644 --- a/py-polars/tests/unit/test_cse.py +++ b/py-polars/tests/unit/test_cse.py @@ -489,49 +489,6 @@ def test_cse_count_in_group_by() -> None: } -def test_no_cse_in_with_context() -> None: - df1 = pl.DataFrame( - { - "timestamp": [ - datetime(2023, 1, 1, 0, 0), - datetime(2023, 5, 1, 0, 0), - datetime(2023, 10, 1, 0, 0), - ], - "value": [2, 5, 9], - } - ) - df2 = pl.DataFrame( - { - "date_start": [ - datetime(2022, 12, 31, 0, 0), - datetime(2023, 1, 2, 0, 0), - ], - "date_end": [ - datetime(2023, 4, 30, 0, 0), - datetime(2023, 5, 5, 0, 0), - ], - "label": [0, 1], - } - ) - - assert ( - df1.lazy() - .with_context(df2.lazy()) - .select( - pl.col("date_start", "label").gather( - pl.col("date_start").search_sorted(pl.col("timestamp")) - 1 - ), - ) - ).collect().to_dict(as_series=False) == { - "date_start": [ - datetime(2022, 12, 31, 0, 0), - datetime(2023, 1, 2, 0, 0), - datetime(2023, 1, 2, 0, 0), - ], - "label": [0, 1, 1], - } - - def test_cse_slice_11594() -> None: df = pl.LazyFrame({"a": [1, 2, 1, 2, 1, 2]}) diff --git a/py-polars/tests/unit/test_predicates.py b/py-polars/tests/unit/test_predicates.py index bcf93fb654be..2c9a7ba3c842 100644 --- a/py-polars/tests/unit/test_predicates.py +++ b/py-polars/tests/unit/test_predicates.py @@ -132,30 +132,6 @@ def test_predicate_pushdown_block_8661() -> None: } -def test_predicate_pushdown_with_context_11014() -> None: - df1 = pl.LazyFrame( - { - "df1_c1": [1, 2, 3], - "df1_c2": [2, 3, 4], - } - ) - - df2 = pl.LazyFrame( - { - "df2_c1": [2, 3, 4], - "df2_c2": [3, 4, 5], - } - ) - - out = ( - df1.with_context(df2) - 
.filter(pl.col("df1_c1").is_in(pl.col("df2_c1"))) - .collect(predicate_pushdown=True) - ) - - assert out.to_dict(as_series=False) == {"df1_c1": [2, 3], "df1_c2": [3, 4]} - - def test_predicate_pushdown_cumsum_9566() -> None: df = pl.DataFrame({"A": range(10), "B": ["b"] * 5 + ["a"] * 5}) diff --git a/py-polars/tests/unit/test_schema.py b/py-polars/tests/unit/test_schema.py index d777af51a524..fd6b154f06cf 100644 --- a/py-polars/tests/unit/test_schema.py +++ b/py-polars/tests/unit/test_schema.py @@ -156,18 +156,6 @@ def test_bool_numeric_supertype() -> None: ) -def test_with_context() -> None: - df_a = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "c", None]}).lazy() - df_b = pl.DataFrame({"c": ["foo", "ham"]}) - - assert ( - df_a.with_context(df_b.lazy()).select([pl.col("b") + pl.col("c").first()]) - ).collect().to_dict(as_series=False) == {"b": ["afoo", "cfoo", None]} - - with pytest.raises(pl.ComputeError): - (df_a.with_context(df_b.lazy()).select(["a", "c"])).collect() - - def test_from_dicts_nested_nulls() -> None: assert pl.from_dicts([{"a": [None, None]}, {"a": [1, 2]}]).to_dict( as_series=False