From caefb64b8131202660036ddaf03035f0ab8a4f70 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Sat, 28 Jan 2023 00:08:36 +0900 Subject: [PATCH] feat(python): automagically upconvert kwarg column expressions with multiple output names to struct form --- py-polars/polars/cfg.py | 4 +- py-polars/polars/internals/dataframe/frame.py | 42 ++++++++++++----- py-polars/polars/internals/expr/expr.py | 26 ++++++---- py-polars/polars/internals/lazyframe/frame.py | 38 ++++++++++----- py-polars/tests/unit/test_cfg.py | 18 +++---- py-polars/tests/unit/test_df.py | 47 ++++++++++++++++++- py-polars/tests/unit/test_meta.py | 1 + 7 files changed, 131 insertions(+), 45 deletions(-) diff --git a/py-polars/polars/cfg.py b/py-polars/polars/cfg.py index ae1c25ec4e5e..0cb3f07d6b50 100644 --- a/py-polars/polars/cfg.py +++ b/py-polars/polars/cfg.py @@ -38,7 +38,7 @@ "POLARS_VERBOSE", } # register Config-local attributes (with their defaults) here -POLARS_CFG_LOCAL_VARS = {"with_columns_kwargs": False} +POLARS_CFG_LOCAL_VARS = {"with_columns_kwargs": True} class Config: @@ -74,7 +74,7 @@ def __exit__( # note: class-local attributes can be used for options that don't have # a Rust component (so, no need to register environment variables). - with_columns_kwargs: bool = False + with_columns_kwargs: bool = True @classmethod def load(cls, cfg: str) -> type[Config]: diff --git a/py-polars/polars/internals/dataframe/frame.py b/py-polars/polars/internals/dataframe/frame.py index 34c684fc69d0..8131d8ad7e39 100644 --- a/py-polars/polars/internals/dataframe/frame.py +++ b/py-polars/polars/internals/dataframe/frame.py @@ -5596,12 +5596,12 @@ def with_columns( **named_exprs: Any, ) -> DataFrame: """ - Return a new DataFrame with the columns added, if new, or replaced. + Return a new DataFrame with the columns added (if new), or replaced. Notes ----- - Creating a new DataFrame using this method does not create a new copy of - existing data. 
+ Creating a new DataFrame using this method does not create a new copy + of existing data. Parameters ---------- @@ -5620,7 +5620,7 @@ def with_columns( ... } ... ) - Passing in a single expression, adding the column as we give it a new name: + Passing in a single expression, adding (and naming) a new column: >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) shape: (4, 4) @@ -5635,8 +5635,8 @@ def with_columns( │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ └─────┴──────┴───────┴──────┘ - We can also override a column, by giving the expression a name that already - exists: + We can also override an existing column by giving the expression + a name that already exists: >>> df.with_columns((pl.col("a") ** 2).alias("c")) shape: (4, 3) @@ -5651,7 +5651,7 @@ def with_columns( │ 4 ┆ 13.0 ┆ 16.0 │ └─────┴──────┴──────┘ - Passing in multiple expressions as a list: + Multiple expressions can be passed in either as a list... >>> df.with_columns( ... [ @@ -5672,17 +5672,15 @@ def with_columns( │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ └─────┴──────┴───────┴──────┴──────┴───────┘ - Support for kwarg expressions is considered EXPERIMENTAL. Currently - requires opt-in via `pl.Config` boolean flag: + ...or via kwarg expressions: - >>> pl.Config.with_columns_kwargs = True >>> df.with_columns( - ... d=pl.col("a") * pl.col("b"), - ... e=pl.col("c").is_not(), + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), ... ) shape: (4, 5) ┌─────┬──────┬───────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ d ┆ e │ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ ╞═════╪══════╪═══════╪══════╪═══════╡ @@ -5692,6 +5690,24 @@ def with_columns( │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ └─────┴──────┴───────┴──────┴───────┘ + Note that, when using kwarg syntax, expressions with multiple + outputs are automatically instantiated as Struct columns: + + >>> df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... 
) + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + """ return ( self.lazy().with_columns(exprs, **named_exprs).collect(no_optimization=True) diff --git a/py-polars/polars/internals/expr/expr.py b/py-polars/polars/internals/expr/expr.py index 8075fb153d5e..5f9bd2e3f71e 100644 --- a/py-polars/polars/internals/expr/expr.py +++ b/py-polars/polars/internals/expr/expr.py @@ -90,6 +90,7 @@ def expr_to_lit_or_expr( | Sequence[int | float | str | None] ), str_to_lit: bool = True, + structify: bool = False, ) -> Expr: """ Convert args to expressions. @@ -101,6 +102,9 @@ def expr_to_lit_or_expr( str_to_lit If True string argument `"foo"` will be converted to `lit("foo")`, If False it will be converted to `col("foo")` + structify + If the final unaliased expression has multiple output names, + automagically convert it to struct. Returns ------- @@ -108,25 +112,31 @@ def expr_to_lit_or_expr( """ if isinstance(expr, str) and not str_to_lit: - return pli.col(expr) + expr = pli.col(expr) elif ( isinstance(expr, (int, float, str, pli.Series, datetime, date, time, timedelta)) or expr is None ): - return pli.lit(expr) - elif isinstance(expr, Expr): - return expr + expr = pli.lit(expr) + structify = False elif isinstance(expr, list): - return pli.lit(pli.Series("", [expr])) + expr = pli.lit(pli.Series("", [expr])) + structify = False elif isinstance(expr, (pli.WhenThen, pli.WhenThenThen)): - # implicitly add the null branch. - return expr.otherwise(None) - else: + expr = expr.otherwise(None) # implicitly add the null branch. 
+ elif not isinstance(expr, Expr): raise ValueError( f"did not expect value {expr} of type {type(expr)}, maybe disambiguate with" " pl.lit or pl.col" ) + if structify: + unaliased_expr = expr.meta.undo_aliases() + if unaliased_expr.meta.has_multiple_outputs(): + expr = cast(Expr, pli.struct(expr)) + + return expr + def wrap_expr(pyexpr: PyExpr) -> Expr: return Expr._from_pyexpr(pyexpr) diff --git a/py-polars/polars/internals/lazyframe/frame.py b/py-polars/polars/internals/lazyframe/frame.py index 71e6ec2504c4..6f1854fc919c 100644 --- a/py-polars/polars/internals/lazyframe/frame.py +++ b/py-polars/polars/internals/lazyframe/frame.py @@ -2454,7 +2454,7 @@ def with_columns( ... } ... ).lazy() - Passing in a single expression, adding the column as we give it a new name: + Passing in a single expression, adding (and naming) a new column: >>> ldf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() shape: (4, 4) @@ -2469,8 +2469,8 @@ def with_columns( │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ └─────┴──────┴───────┴──────┘ - We can also override a column, by giving the expression a name that already - exists: + We can also override an existing column by giving the expression + a name that already exists: >>> ldf.with_columns((pl.col("a") ** 2).alias("c")).collect() shape: (4, 3) @@ -2485,7 +2485,7 @@ def with_columns( │ 4 ┆ 13.0 ┆ 16.0 │ └─────┴──────┴──────┘ - Passing in multiple expressions as a list: + Multiple expressions can be passed in either as a list... >>> ldf.with_columns( ... [ @@ -2506,17 +2506,15 @@ def with_columns( │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ └─────┴──────┴───────┴──────┴──────┴───────┘ - Support for kwarg expressions is considered EXPERIMENTAL. Currently - requires opt-in via `pl.Config` boolean flag: + ...or via kwarg expressions: - >>> pl.Config.with_columns_kwargs = True >>> ldf.with_columns( - ... d=pl.col("a") * pl.col("b"), - ... e=pl.col("c").is_not(), + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), ... 
).collect() shape: (4, 5) ┌─────┬──────┬───────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ d ┆ e │ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ ╞═════╪══════╪═══════╪══════╪═══════╡ @@ -2526,6 +2524,24 @@ def with_columns( │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ └─────┴──────┴───────┴──────┴───────┘ + Note that, when using kwarg syntax, expressions with multiple + outputs are automatically instantiated as Struct columns: + + >>> ldf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + """ if named_exprs and not Config.with_columns_kwargs: raise RuntimeError( @@ -2545,7 +2561,7 @@ def with_columns( exprs = list(exprs) exprs.extend( - pli.expr_to_lit_or_expr(expr).alias(name) + pli.expr_to_lit_or_expr(expr, structify=True).alias(name) for name, expr in named_exprs.items() ) pyexprs = [] diff --git a/py-polars/tests/unit/test_cfg.py b/py-polars/tests/unit/test_cfg.py index 8744037956dd..1d37c93b40bb 100644 --- a/py-polars/tests/unit/test_cfg.py +++ b/py-polars/tests/unit/test_cfg.py @@ -411,8 +411,8 @@ def test_string_cache() -> None: def test_config_load_save() -> None: - # set some config options - pl.Config.with_columns_kwargs = True + # set some config options... + pl.Config.with_columns_kwargs = False pl.Config.set_verbose(True) assert os.environ["POLARS_VERBOSE"] == "1" @@ -420,24 +420,24 @@ def test_config_load_save() -> None: assert isinstance(cfg, str) assert "POLARS_VERBOSE" in pl.Config.state(if_set=True) - # unset the saved options - pl.Config.with_columns_kwargs = False + # ...modify the same options... 
+ pl.Config.with_columns_kwargs = True pl.Config.set_verbose(False) assert os.environ["POLARS_VERBOSE"] == "0" - # now load back from config... + # ...load back from config... pl.Config.load(cfg) - # ...and confirm the saved options were set + # ...and confirm the saved options were set. assert os.environ["POLARS_VERBOSE"] == "1" - assert pl.Config.with_columns_kwargs is True + assert pl.Config.with_columns_kwargs is False - # restore explicitly-set config options (unsets from env) + # restore all default options (unsets from env) pl.Config.restore_defaults() assert "POLARS_VERBOSE" not in pl.Config.state(if_set=True) assert "POLARS_VERBOSE" in pl.Config.state() assert os.environ.get("POLARS_VERBOSE") is None - assert pl.Config.with_columns_kwargs is False + assert pl.Config.with_columns_kwargs is True def test_config_scope() -> None: diff --git a/py-polars/tests/unit/test_df.py b/py-polars/tests/unit/test_df.py index d03c0b0a78d5..f254ead95203 100644 --- a/py-polars/tests/unit/test_df.py +++ b/py-polars/tests/unit/test_df.py @@ -2481,9 +2481,52 @@ def test_with_columns() -> None: ) assert_frame_equal(dx, expected) - # at least one of exprs/**named_exprs required + # automatically upconvert multi-output expressions to struct + ldf = ( + pl.DataFrame({"x1": [1, 2, 6], "x2": [1, 2, 3]}) + .lazy() + .with_columns( + pct_change=pl.col(["x1", "x2"]).pct_change(), + maxes=pl.all().max().suffix("_max"), + xcols=pl.col("^x.*$"), + ) + ) + # ┌─────┬─────┬─────────────┬───────────┬───────────┐ + # │ x1 ┆ x2 ┆ pct_change ┆ maxes ┆ xcols │ + # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + # │ i64 ┆ i64 ┆ struct[2] ┆ struct[2] ┆ struct[2] │ + # ╞═════╪═════╪═════════════╪═══════════╪═══════════╡ + # │ 1 ┆ 1 ┆ {null,null} ┆ {6,3} ┆ {1,1} │ + # │ 2 ┆ 2 ┆ {1.0,1.0} ┆ {6,3} ┆ {2,2} │ + # │ 6 ┆ 3 ┆ {2.0,0.5} ┆ {6,3} ┆ {6,3} │ + # └─────┴─────┴─────────────┴───────────┴───────────┘ + assert ldf.collect().to_dicts() == [ + { + "x1": 1, + "x2": 1, + "pct_change": {"x1": None, "x2": None}, + 
"maxes": {"x1_max": 6, "x2_max": 3}, + "xcols": {"x1": 1, "x2": 1}, + }, + { + "x1": 2, + "x2": 2, + "pct_change": {"x1": 1.0, "x2": 1.0}, + "maxes": {"x1_max": 6, "x2_max": 3}, + "xcols": {"x1": 2, "x2": 2}, + }, + { + "x1": 6, + "x2": 3, + "pct_change": {"x1": 2.0, "x2": 0.5}, + "maxes": {"x1_max": 6, "x2_max": 3}, + "xcols": {"x1": 6, "x2": 3}, + }, + ] + + # require at least one of exprs / **named_exprs with pytest.raises(ValueError): - _ = df.with_columns() + _ = ldf.with_columns() def test_len_compute(df: pl.DataFrame) -> None: diff --git a/py-polars/tests/unit/test_meta.py b/py-polars/tests/unit/test_meta.py index f45aa5bee8c3..cc83a43ee228 100644 --- a/py-polars/tests/unit/test_meta.py +++ b/py-polars/tests/unit/test_meta.py @@ -58,3 +58,4 @@ def test_meta_has_multiple_outputs() -> None: def test_meta_is_regex_projection() -> None: e = pl.col("^.*$").alias("bar") assert e.meta.is_regex_projection() + assert e.meta.has_multiple_outputs()