Skip to content

Commit

Permalink
feat(python): automagically upconvert kwarg column expressions with m…
Browse files Browse the repository at this point in the history
…ultiple output names to struct form
  • Loading branch information
alexander-beedie committed Jan 28, 2023
1 parent 6146f3b commit bef9da0
Show file tree
Hide file tree
Showing 7 changed files with 131 additions and 45 deletions.
4 changes: 2 additions & 2 deletions py-polars/polars/cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
"POLARS_VERBOSE",
}
# register Config-local attributes (with their defaults) here
POLARS_CFG_LOCAL_VARS = {"with_columns_kwargs": False}
POLARS_CFG_LOCAL_VARS = {"with_columns_kwargs": True}


class Config:
Expand Down Expand Up @@ -74,7 +74,7 @@ def __exit__(

# note: class-local attributes can be used for options that don't have
# a Rust component (so, no need to register environment variables).
with_columns_kwargs: bool = False
with_columns_kwargs: bool = True

@classmethod
def load(cls, cfg: str) -> type[Config]:
Expand Down
42 changes: 29 additions & 13 deletions py-polars/polars/internals/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5596,12 +5596,12 @@ def with_columns(
**named_exprs: Any,
) -> DataFrame:
"""
Return a new DataFrame with the columns added, if new, or replaced.
Return a new DataFrame with the columns added (if new), or replaced.
Notes
-----
Creating a new DataFrame using this method does not create a new copy of
existing data.
Creating a new DataFrame using this method does not create a new copy
of existing data.
Parameters
----------
Expand All @@ -5620,7 +5620,7 @@ def with_columns(
... }
... )
Passing in a single expression, adding the column as we give it a new name:
Passing in a single expression, adding (and naming) a new column:
>>> df.with_columns((pl.col("a") ** 2).alias("a^2"))
shape: (4, 4)
Expand All @@ -5635,8 +5635,8 @@ def with_columns(
│ 4 ┆ 13.0 ┆ true ┆ 16.0 │
└─────┴──────┴───────┴──────┘
We can also override a column, by giving the expression a name that already
exists:
We can also override an existing column by giving the expression
a name that already exists:
>>> df.with_columns((pl.col("a") ** 2).alias("c"))
shape: (4, 3)
Expand All @@ -5651,7 +5651,7 @@ def with_columns(
│ 4 ┆ 13.0 ┆ 16.0 │
└─────┴──────┴──────┘
Passing in multiple expressions as a list:
Multiple expressions can be passed in either as a list...
>>> df.with_columns(
... [
Expand All @@ -5672,17 +5672,15 @@ def with_columns(
│ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
└─────┴──────┴───────┴──────┴──────┴───────┘
Support for kwarg expressions is considered EXPERIMENTAL. Currently
requires opt-in via `pl.Config` boolean flag:
...or via kwarg expressions:
>>> pl.Config.with_columns_kwargs = True
>>> df.with_columns(
... d=pl.col("a") * pl.col("b"),
... e=pl.col("c").is_not(),
... ab=pl.col("a") * pl.col("b"),
... not_c=pl.col("c").is_not(),
... )
shape: (4, 5)
┌─────┬──────┬───────┬──────┬───────┐
│ a ┆ b ┆ c ┆ d ┆ e
│ a ┆ b ┆ c ┆ ab ┆ not_c
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │
╞═════╪══════╪═══════╪══════╪═══════╡
Expand All @@ -5692,6 +5690,24 @@ def with_columns(
│ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │
└─────┴──────┴───────┴──────┴───────┘
Note that, when using kwarg syntax, expressions with multiple
outputs are automatically instantiated as Struct columns:
>>> df.drop("c").with_columns(
... diff=pl.col(["a", "b"]).diff().suffix("_diff"),
... )
shape: (4, 3)
┌─────┬──────┬─────────────┐
│ a ┆ b ┆ diff │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ struct[2] │
╞═════╪══════╪═════════════╡
│ 1 ┆ 0.5 ┆ {null,null} │
│ 2 ┆ 4.0 ┆ {1,3.5} │
│ 3 ┆ 10.0 ┆ {1,6.0} │
│ 4 ┆ 13.0 ┆ {1,3.0} │
└─────┴──────┴─────────────┘
"""
return (
self.lazy().with_columns(exprs, **named_exprs).collect(no_optimization=True)
Expand Down
26 changes: 18 additions & 8 deletions py-polars/polars/internals/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def expr_to_lit_or_expr(
| Sequence[int | float | str | None]
),
str_to_lit: bool = True,
structify: bool = False,
) -> Expr:
"""
Convert args to expressions.
Expand All @@ -101,32 +102,41 @@ def expr_to_lit_or_expr(
str_to_lit
If True string argument `"foo"` will be converted to `lit("foo")`,
If False it will be converted to `col("foo")`
structify
If the final unaliased expression has multiple output names,
automatically convert it to a struct expression.
Returns
-------
Expr
"""
if isinstance(expr, str) and not str_to_lit:
return pli.col(expr)
expr = pli.col(expr)
elif (
isinstance(expr, (int, float, str, pli.Series, datetime, date, time, timedelta))
or expr is None
):
return pli.lit(expr)
elif isinstance(expr, Expr):
return expr
expr = pli.lit(expr)
structify = False
elif isinstance(expr, list):
return pli.lit(pli.Series("", [expr]))
expr = pli.lit(pli.Series("", [expr]))
structify = False
elif isinstance(expr, (pli.WhenThen, pli.WhenThenThen)):
# implicitly add the null branch.
return expr.otherwise(None)
else:
expr = expr.otherwise(None) # implicitly add the null branch.
elif not isinstance(expr, Expr):
raise ValueError(
f"did not expect value {expr} of type {type(expr)}, maybe disambiguate with"
" pl.lit or pl.col"
)

if structify:
unaliased_expr = expr.meta.undo_aliases()
if unaliased_expr.meta.has_multiple_outputs():
expr = cast(Expr, pli.struct(expr))

return expr


def wrap_expr(pyexpr: PyExpr) -> Expr:
    """Wrap a low-level ``PyExpr`` in a Python-side ``Expr`` object."""
    wrapped = Expr._from_pyexpr(pyexpr)
    return wrapped
Expand Down
38 changes: 27 additions & 11 deletions py-polars/polars/internals/lazyframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2454,7 +2454,7 @@ def with_columns(
... }
... ).lazy()
Passing in a single expression, adding the column as we give it a new name:
Passing in a single expression, adding (and naming) a new column:
>>> ldf.with_columns((pl.col("a") ** 2).alias("a^2")).collect()
shape: (4, 4)
Expand All @@ -2469,8 +2469,8 @@ def with_columns(
│ 4 ┆ 13.0 ┆ true ┆ 16.0 │
└─────┴──────┴───────┴──────┘
We can also override a column, by giving the expression a name that already
exists:
We can also override an existing column by giving the expression
a name that already exists:
>>> ldf.with_columns((pl.col("a") ** 2).alias("c")).collect()
shape: (4, 3)
Expand All @@ -2485,7 +2485,7 @@ def with_columns(
│ 4 ┆ 13.0 ┆ 16.0 │
└─────┴──────┴──────┘
Passing in multiple expressions as a list:
Multiple expressions can be passed in either as a list...
>>> ldf.with_columns(
... [
Expand All @@ -2506,17 +2506,15 @@ def with_columns(
│ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
└─────┴──────┴───────┴──────┴──────┴───────┘
Support for kwarg expressions is considered EXPERIMENTAL. Currently
requires opt-in via `pl.Config` boolean flag:
...or via kwarg expressions:
>>> pl.Config.with_columns_kwargs = True
>>> ldf.with_columns(
... d=pl.col("a") * pl.col("b"),
... e=pl.col("c").is_not(),
... ab=pl.col("a") * pl.col("b"),
... not_c=pl.col("c").is_not(),
... ).collect()
shape: (4, 5)
┌─────┬──────┬───────┬──────┬───────┐
│ a ┆ b ┆ c ┆ d ┆ e
│ a ┆ b ┆ c ┆ ab ┆ not_c
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │
╞═════╪══════╪═══════╪══════╪═══════╡
Expand All @@ -2526,6 +2524,24 @@ def with_columns(
│ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │
└─────┴──────┴───────┴──────┴───────┘
Note that, when using kwarg syntax, expressions with multiple
outputs are automatically instantiated as Struct columns:
>>> ldf.drop("c").with_columns(
... diff=pl.col(["a", "b"]).diff().suffix("_diff"),
... ).collect()
shape: (4, 3)
┌─────┬──────┬─────────────┐
│ a ┆ b ┆ diff │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ struct[2] │
╞═════╪══════╪═════════════╡
│ 1 ┆ 0.5 ┆ {null,null} │
│ 2 ┆ 4.0 ┆ {1,3.5} │
│ 3 ┆ 10.0 ┆ {1,6.0} │
│ 4 ┆ 13.0 ┆ {1,3.0} │
└─────┴──────┴─────────────┘
"""
if named_exprs and not Config.with_columns_kwargs:
raise RuntimeError(
Expand All @@ -2545,7 +2561,7 @@ def with_columns(
exprs = list(exprs)

exprs.extend(
pli.expr_to_lit_or_expr(expr).alias(name)
pli.expr_to_lit_or_expr(expr, structify=True).alias(name)
for name, expr in named_exprs.items()
)
pyexprs = []
Expand Down
18 changes: 9 additions & 9 deletions py-polars/tests/unit/test_cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,33 +411,33 @@ def test_string_cache() -> None:


def test_config_load_save() -> None:
# set some config options
pl.Config.with_columns_kwargs = True
# set some config options...
pl.Config.with_columns_kwargs = False
pl.Config.set_verbose(True)
assert os.environ["POLARS_VERBOSE"] == "1"

cfg = pl.Config.save()
assert isinstance(cfg, str)
assert "POLARS_VERBOSE" in pl.Config.state(if_set=True)

# unset the saved options
pl.Config.with_columns_kwargs = False
# ...modify the same options...
pl.Config.with_columns_kwargs = True
pl.Config.set_verbose(False)
assert os.environ["POLARS_VERBOSE"] == "0"

# now load back from config...
# ...load back from config...
pl.Config.load(cfg)

# ...and confirm the saved options were set
# ...and confirm the saved options were set.
assert os.environ["POLARS_VERBOSE"] == "1"
assert pl.Config.with_columns_kwargs is True
assert pl.Config.with_columns_kwargs is False

# restore explicitly-set config options (unsets from env)
# restore all default options (unsets from env)
pl.Config.restore_defaults()
assert "POLARS_VERBOSE" not in pl.Config.state(if_set=True)
assert "POLARS_VERBOSE" in pl.Config.state()
assert os.environ.get("POLARS_VERBOSE") is None
assert pl.Config.with_columns_kwargs is False
assert pl.Config.with_columns_kwargs is True


def test_config_scope() -> None:
Expand Down
47 changes: 45 additions & 2 deletions py-polars/tests/unit/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -2481,9 +2481,52 @@ def test_with_columns() -> None:
)
assert_frame_equal(dx, expected)

# at least one of exprs/**named_exprs required
# automatically upconvert multi-output expressions to struct
ldf = (
pl.DataFrame({"x1": [1, 2, 6], "x2": [1, 2, 3]})
.lazy()
.with_columns(
pct_change=pl.col(["x1", "x2"]).pct_change(),
maxes=pl.all().max().suffix("_max"),
xcols=pl.col("^x.*$"),
)
)
# ┌─────┬─────┬─────────────┬───────────┬───────────┐
# │ x1 ┆ x2 ┆ pct_change ┆ maxes ┆ xcols │
# │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
# │ i64 ┆ i64 ┆ struct[2] ┆ struct[2] ┆ struct[2] │
# ╞═════╪═════╪═════════════╪═══════════╪═══════════╡
# │ 1 ┆ 1 ┆ {null,null} ┆ {6,3} ┆ {1,1} │
# │ 2 ┆ 2 ┆ {1.0,1.0} ┆ {6,3} ┆ {2,2} │
# │ 6 ┆ 3 ┆ {2.0,0.5} ┆ {6,3} ┆ {6,3} │
# └─────┴─────┴─────────────┴───────────┴───────────┘
assert ldf.collect().to_dicts() == [
{
"x1": 1,
"x2": 1,
"pct_change": {"x1": None, "x2": None},
"maxes": {"x1_max": 6, "x2_max": 3},
"xcols": {"x1": 1, "x2": 1},
},
{
"x1": 2,
"x2": 2,
"pct_change": {"x1": 1.0, "x2": 1.0},
"maxes": {"x1_max": 6, "x2_max": 3},
"xcols": {"x1": 2, "x2": 2},
},
{
"x1": 6,
"x2": 3,
"pct_change": {"x1": 2.0, "x2": 0.5},
"maxes": {"x1_max": 6, "x2_max": 3},
"xcols": {"x1": 6, "x2": 3},
},
]

# require at least one of exprs / **named_exprs
with pytest.raises(ValueError):
_ = df.with_columns()
_ = ldf.with_columns()


def test_len_compute(df: pl.DataFrame) -> None:
Expand Down
1 change: 1 addition & 0 deletions py-polars/tests/unit/test_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,4 @@ def test_meta_has_multiple_outputs() -> None:
def test_meta_is_regex_projection() -> None:
    """Check that an aliased regex column selection is flagged as such.

    The same expression must also report that it has multiple outputs.
    """
    regex_expr = pl.col("^.*$").alias("bar")
    meta = regex_expr.meta
    assert meta.is_regex_projection()
    assert meta.has_multiple_outputs()

0 comments on commit bef9da0

Please sign in to comment.