From c538971d1709ac30e26c99942768e55483a604e2 Mon Sep 17 00:00:00 2001
From: Ritchie Vink
Date: Wed, 23 Nov 2022 11:45:09 +0100
Subject: [PATCH] refactor(python)!: rollup breaking changes

- Remove deprecated function arguments.
- Deprecate 'allow_streaming' in favor of 'streaming'
- Make 'collect', 'fetch', and 'show_graph' kwargs only

---
 py-polars/polars/internals/dataframe/frame.py | 30 ++++------------
 .../polars/internals/dataframe/groupby.py     | 10 +++---
 py-polars/polars/internals/lazyframe/frame.py | 34 +++++++++----------
 py-polars/polars/internals/series/series.py   |  4 ---
 py-polars/tests/unit/test_df.py               |  4 ---
 py-polars/tests/unit/test_groupby.py          |  6 ++--
 py-polars/tests/unit/test_projections.py      |  2 +-
 7 files changed, 31 insertions(+), 59 deletions(-)

diff --git a/py-polars/polars/internals/dataframe/frame.py b/py-polars/polars/internals/dataframe/frame.py
index eb548f4556d7..c86cd9304f77 100644
--- a/py-polars/polars/internals/dataframe/frame.py
+++ b/py-polars/polars/internals/dataframe/frame.py
@@ -65,7 +65,6 @@
     _prepare_row_count_args,
     _process_null_values,
     _timedelta_to_pl_duration,
-    deprecated_alias,
     format_path,
     handle_projection_columns,
     is_bool_sequence,
@@ -2465,7 +2464,7 @@ def filter(
         return (
             self.lazy()
             .filter(predicate)  # type: ignore[arg-type]
-            .collect(no_optimization=True, string_cache=False)
+            .collect(no_optimization=True)
         )
 
     def describe(self: DF) -> DF:
@@ -2668,11 +2667,7 @@ def sort(
 
         """
         if not isinstance(by, str) and isinstance(by, (Sequence, pli.Expr)):
-            df = (
-                self.lazy()
-                .sort(by, reverse, nulls_last)
-                .collect(no_optimization=True, string_cache=False)
-            )
+            df = self.lazy().sort(by, reverse, nulls_last).collect(no_optimization=True)
             return df
         return self._from_pydf(self._df.sort(by, reverse, nulls_last))
 
@@ -2782,7 +2777,6 @@ def slice(self: DF, offset: int, length: int | None = None) -> DF:
             length = self.height - offset + length
         return self._from_pydf(self._df.slice(offset, length))
 
-    @deprecated_alias(length="n")
     def limit(self: DF, n: int = 5) -> DF:
         """
         Get the first `n` rows.
@@ -2818,7 +2812,6 @@ def limit(self: DF, n: int = 5) -> DF:
         """
         return self.head(n)
 
-    @deprecated_alias(length="n")
     def head(self: DF, n: int = 5) -> DF:
         """
         Get the first `n` rows.
@@ -2854,7 +2847,6 @@ def head(self: DF, n: int = 5) -> DF:
         """
         return self._from_pydf(self._df.head(n))
 
-    @deprecated_alias(length="n")
     def tail(self: DF, n: int = 5) -> DF:
         """
         Get the last `n` rows.
@@ -4098,11 +4090,7 @@ def with_column(self, column: pli.Series | pli.Expr) -> DataFrame:
         └──────┴─────┘
 
         """
-        return (
-            self.lazy()
-            .with_column(column)
-            .collect(no_optimization=True, string_cache=False)
-        )
+        return self.lazy().with_column(column).collect(no_optimization=True)
 
     def hstack(
         self: DF,
@@ -4254,7 +4242,6 @@ def extend(self: DF, other: DF) -> DF:
         self._df.extend(other._df)
         return self
 
-    @deprecated_alias(name="columns")
     def drop(self: DF, columns: str | Sequence[str]) -> DF:
         """
         Remove column from DataFrame and return as new.
@@ -5234,7 +5221,7 @@ def shift_and_fill(self, periods: int, fill_value: int | str | float) -> DataFra
         return (
             self.lazy()
             .shift_and_fill(periods, fill_value)
-            .collect(no_optimization=True, string_cache=False)
+            .collect(no_optimization=True)
         )
 
     def is_duplicated(self) -> pli.Series:
@@ -5411,10 +5398,7 @@ def select(
 
         """
         return self._from_pydf(
-            self.lazy()
-            .select(exprs)
-            .collect(no_optimization=True, string_cache=False)
-            ._df
+            self.lazy().select(exprs).collect(no_optimization=True)._df
         )
 
     def with_columns(
@@ -5490,9 +5474,7 @@
         if exprs is not None and not isinstance(exprs, Sequence):
             exprs = [exprs]
         return (
-            self.lazy()
-            .with_columns(exprs, **named_exprs)
-            .collect(no_optimization=True, string_cache=False)
+            self.lazy().with_columns(exprs, **named_exprs).collect(no_optimization=True)
         )
 
     @overload
diff --git a/py-polars/polars/internals/dataframe/groupby.py b/py-polars/polars/internals/dataframe/groupby.py
index 1d234a317de3..035864cfca59 100644
--- a/py-polars/polars/internals/dataframe/groupby.py
+++ b/py-polars/polars/internals/dataframe/groupby.py
@@ -298,7 +298,7 @@ def agg(self, aggs: pli.Expr | Sequence[pli.Expr]) -> pli.DataFrame:
             .lazy()
             .groupby(self.by, maintain_order=self.maintain_order)
             .agg(aggs)
-            .collect(no_optimization=True, string_cache=False)
+            .collect(no_optimization=True)
         )
         return self._dataframe_class._from_pydf(df._df)
 
@@ -362,7 +362,7 @@ def head(self, n: int = 5) -> DF:
             .lazy()
             .groupby(self.by, self.maintain_order)
             .head(n)
-            .collect(no_optimization=True, string_cache=False)
+            .collect(no_optimization=True)
         )
         return self._dataframe_class._from_pydf(df._df)
 
@@ -426,7 +426,7 @@ def tail(self, n: int = 5) -> DF:
             .lazy()
             .groupby(self.by, self.maintain_order)
             .tail(n)
-            .collect(no_optimization=True, string_cache=False)
+            .collect(no_optimization=True)
         )
         return self._dataframe_class._from_pydf(df._df)
 
@@ -858,7 +858,7 @@ def agg(self, aggs: pli.Expr | Sequence[pli.Expr]) -> pli.DataFrame:
                 self.time_column, self.period, self.offset, self.closed, self.by
             )
             .agg(aggs)
-            .collect(no_optimization=True, string_cache=False)
+            .collect(no_optimization=True)
         )
 
 
@@ -911,7 +911,7 @@ def agg(self, aggs: pli.Expr | Sequence[pli.Expr]) -> pli.DataFrame:
                 self.by,
             )
             .agg(aggs)
-            .collect(no_optimization=True, string_cache=False)
+            .collect(no_optimization=True)
         )
 
 
diff --git a/py-polars/polars/internals/lazyframe/frame.py b/py-polars/polars/internals/lazyframe/frame.py
index ff24431c58f7..211683acbfe3 100644
--- a/py-polars/polars/internals/lazyframe/frame.py
+++ b/py-polars/polars/internals/lazyframe/frame.py
@@ -43,6 +43,7 @@
     _prepare_row_count_args,
     _process_null_values,
     _timedelta_to_pl_duration,
+    deprecated_alias,
     format_path,
 )
 
@@ -596,6 +597,7 @@ def describe_plan(self) -> str:
         """Create a string representation of the unoptimized query plan."""
         return self._ldf.describe_plan()
 
+    @deprecated_alias(allow_streaming="streaming")
     def describe_optimized_plan(
         self,
         type_coercion: bool = True,
@@ -604,7 +606,7 @@ def describe_optimized_plan(
         simplify_expression: bool = True,
         slice_pushdown: bool = True,
         common_subplan_elimination: bool = True,
-        allow_streaming: bool = False,
+        streaming: bool = False,
     ) -> str:
         """Create a string representation of the optimized query plan."""
         ldf = self._ldf.optimization_toggle(
@@ -614,14 +616,16 @@ def describe_optimized_plan(
             simplify_expression,
             slice_pushdown,
             common_subplan_elimination,
-            allow_streaming,
+            streaming,
         )
         return ldf.describe_optimized_plan()
 
+    @deprecated_alias(allow_streaming="streaming")
     def show_graph(
         self,
         optimized: bool = True,
+        *,
         show: bool = True,
         output_path: str | None = None,
         raw_output: bool = False,
@@ -632,7 +636,7 @@ def show_graph(
         simplify_expression: bool = True,
         slice_pushdown: bool = True,
         common_subplan_elimination: bool = True,
-        allow_streaming: bool = False,
+        streaming: bool = False,
     ) -> str | None:
         """
         Show a plot of the query plan. Note that you should have graphviz installed.
@@ -663,7 +667,7 @@
             Will try to cache branching subplans that occur on self-joins or unions.
         common_subplan_elimination
             Will try to cache branching subplans that occur on self-joins or unions.
-        allow_streaming
+        streaming
             Run parts of the query in a streaming fashion (this is in an alpha state)
         """
         _ldf = self._ldf.optimization_toggle(
@@ -674,7 +678,7 @@
             simplify_expression,
             slice_pushdown,
             common_subplan_elimination,
-            allow_streaming,
+            streaming,
         )
         dot = _ldf.to_dot(optimized)
 
@@ -956,18 +960,18 @@ def profile(
 
         return df, timings
 
+    @deprecated_alias(allow_streaming="streaming")
     def collect(
         self,
+        *,
         type_coercion: bool = True,
         predicate_pushdown: bool = True,
         projection_pushdown: bool = True,
-        *,
         simplify_expression: bool = True,
-        string_cache: bool = False,
         no_optimization: bool = False,
         slice_pushdown: bool = True,
         common_subplan_elimination: bool = True,
-        allow_streaming: bool = False,
+        streaming: bool = False,
     ) -> pli.DataFrame:
         """
         Collect into a DataFrame.
@@ -985,16 +989,13 @@
             Do projection pushdown optimization.
         simplify_expression
             Run simplify expressions optimization.
-        string_cache
-            This argument is deprecated. Please set the string cache globally.
-            The argument will be ignored
         no_optimization
             Turn off (certain) optimizations.
         slice_pushdown
             Slice pushdown optimization.
         common_subplan_elimination
             Will try to cache branching subplans that occur on self-joins or unions.
-        allow_streaming
+        streaming
             Run parts of the query in a streaming fashion (this is in an alpha state)
 
         Returns
@@ -1031,7 +1032,7 @@
             slice_pushdown = False
             common_subplan_elimination = False
 
-        if allow_streaming:
+        if streaming:
             common_subplan_elimination = False
 
         ldf = self._ldf.optimization_toggle(
@@ -1041,18 +1042,18 @@
             simplify_expression,
             slice_pushdown,
             common_subplan_elimination,
-            allow_streaming,
+            streaming,
         )
         return pli.wrap_df(ldf.collect())
 
     def fetch(
         self,
         n_rows: int = 500,
+        *,
         type_coercion: bool = True,
         predicate_pushdown: bool = True,
         projection_pushdown: bool = True,
         simplify_expression: bool = True,
-        string_cache: bool = False,
         no_optimization: bool = False,
         slice_pushdown: bool = True,
         common_subplan_elimination: bool = True,
@@ -1081,9 +1082,6 @@
             Run projection pushdown optimization.
         simplify_expression
             Run simplify expressions optimization.
-        string_cache
-            This argument is deprecated. Please set the string cache globally.
-            The argument will be ignored
         no_optimization
             Turn off optimizations.
         slice_pushdown
diff --git a/py-polars/polars/internals/series/series.py b/py-polars/polars/internals/series/series.py
index b7ce6d2d6dc5..7bdacf86ab1b 100644
--- a/py-polars/polars/internals/series/series.py
+++ b/py-polars/polars/internals/series/series.py
@@ -76,7 +76,6 @@
     _datetime_to_pl_timestamp,
     _time_to_pl_time,
     accessor,
-    deprecated_alias,
     is_bool_sequence,
     is_int_sequence,
     range_to_slice,
@@ -1607,7 +1606,6 @@ def cumprod(self, reverse: bool = False) -> Series:
 
         """
 
-    @deprecated_alias(num_elements="n")
     def limit(self, n: int = 10) -> Series:
         """
         Get the first `n` rows.
@@ -1746,7 +1744,6 @@ def filter(self, predicate: Series | list[bool]) -> Series:
             predicate = Series("", predicate)
         return wrap_s(self._s.filter(predicate._s))
 
-    @deprecated_alias(length="n")
     def head(self, n: int = 10) -> Series:
         """
         Get the first `n` rows.
@@ -1770,7 +1767,6 @@ def head(self, n: int = 10) -> Series:
 
         """
         return self.to_frame().select(pli.col(self.name).head(n)).to_series()
 
-    @deprecated_alias(length="n")
     def tail(self, n: int = 10) -> Series:
         """
         Get the last `n` rows.
diff --git a/py-polars/tests/unit/test_df.py b/py-polars/tests/unit/test_df.py
index 00d421ad5ee9..a16337ffafbc 100644
--- a/py-polars/tests/unit/test_df.py
+++ b/py-polars/tests/unit/test_df.py
@@ -679,10 +679,6 @@ def test_extend() -> None:
 
 def test_drop() -> None:
     df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [1, 2, 3]})
-    with pytest.deprecated_call():
-        df = df.drop(name="a")  # type: ignore[call-arg]
-    assert df.shape == (3, 2)
-
-    df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [1, 2, 3]})
     df = df.drop(columns="a")
     assert df.shape == (3, 2)
     df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [1, 2, 3]})
diff --git a/py-polars/tests/unit/test_groupby.py b/py-polars/tests/unit/test_groupby.py
index 36a69324ea88..ea4442d3f2d4 100644
--- a/py-polars/tests/unit/test_groupby.py
+++ b/py-polars/tests/unit/test_groupby.py
@@ -227,11 +227,11 @@ def test_streaming_non_streaming_gb() -> None:
     n = 100
     df = pl.DataFrame({"a": np.random.randint(0, 20, n)})
     q = df.lazy().groupby("a").agg(pl.count()).sort("a")
-    assert q.collect(allow_streaming=True).frame_equal(q.collect())
+    assert q.collect(streaming=True).frame_equal(q.collect())
 
     q = df.lazy().with_column(pl.col("a").cast(pl.Utf8))
     q = q.groupby("a").agg(pl.count()).sort("a")
-    assert q.collect(allow_streaming=True).frame_equal(q.collect())
+    assert q.collect(streaming=True).frame_equal(q.collect())
     q = df.lazy().with_column(pl.col("a").alias("b"))
     q = q.groupby(["a", "b"]).agg(pl.count()).sort("a")
-    assert q.collect(allow_streaming=True).frame_equal(q.collect())
+    assert q.collect(streaming=True).frame_equal(q.collect())
diff --git a/py-polars/tests/unit/test_projections.py b/py-polars/tests/unit/test_projections.py
index 79d990192930..1663141d5128 100644
--- a/py-polars/tests/unit/test_projections.py
+++ b/py-polars/tests/unit/test_projections.py
@@ -99,7 +99,7 @@ def test_unnest_columns_available() -> None:
 def test_streaming_duplicate_cols_5537() -> None:
     assert pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}).lazy().with_columns(
         [(pl.col("a") * 2).alias("foo"), (pl.col("a") * 3)]
-    ).collect(allow_streaming=True).to_dict(False) == {
+    ).collect(streaming=True).to_dict(False) == {
         "a": [3, 6, 9],
         "b": [1, 2, 3],
         "foo": [2, 4, 6],