Skip to content

Commit

Permalink
feat(python): automagically upconvert with_columns kwarg expression…
Browse files Browse the repository at this point in the history
…s with multiple output names to struct; extend `**named_kwargs` support to `select` (pola-rs#6497)

Co-authored-by: Ritchie Vink <ritchie46@gmail.com>
  • Loading branch information
2 people authored and cojmeister committed Jan 30, 2023
1 parent b9ab0f4 commit fa3f48c
Show file tree
Hide file tree
Showing 8 changed files with 228 additions and 60 deletions.
4 changes: 2 additions & 2 deletions py-polars/polars/cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
"POLARS_VERBOSE",
}
# register Config-local attributes (with their defaults) here
POLARS_CFG_LOCAL_VARS = {"with_columns_kwargs": False}
POLARS_CFG_LOCAL_VARS = {"with_columns_kwargs": True}


class Config:
Expand Down Expand Up @@ -74,7 +74,7 @@ def __exit__(

# note: class-local attributes can be used for options that don't have
# a Rust component (so, no need to register environment variables).
with_columns_kwargs: bool = False
with_columns_kwargs: bool = True

@classmethod
def load(cls, cfg: str) -> type[Config]:
Expand Down
70 changes: 53 additions & 17 deletions py-polars/polars/internals/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5506,7 +5506,9 @@ def select(
| pli.Expr
| pli.Series
| Iterable[str | pli.Expr | pli.Series | pli.WhenThen | pli.WhenThenThen]
),
| None
) = None,
**named_exprs: Any,
) -> DF:
"""
Select columns from this DataFrame.
Expand All @@ -5515,6 +5517,8 @@ def select(
----------
exprs
Column or columns to select.
**named_exprs
Named column expressions, provided as kwargs.
Examples
--------
Expand Down Expand Up @@ -5585,9 +5589,25 @@ def select(
│ 10 │
└─────────┘
Note that, when using kwargs syntax, expressions with multiple
outputs are automatically instantiated as Struct columns:
>>> from polars.datatypes import INTEGER_DTYPES
>>> df.select(is_odd=(pl.col(INTEGER_DTYPES) % 2).suffix("_is_odd"))
shape: (3, 1)
┌───────────┐
│ is_odd │
│ --- │
│ struct[2] │
╞═══════════╡
│ {1,0} │
│ {0,1} │
│ {1,0} │
└───────────┘
"""
return self._from_pydf(
self.lazy().select(exprs).collect(no_optimization=True)._df
self.lazy().select(exprs, **named_exprs).collect(no_optimization=True)._df
)

def with_columns(
Expand All @@ -5596,19 +5616,19 @@ def with_columns(
**named_exprs: Any,
) -> DataFrame:
"""
Return a new DataFrame with the columns added, if new, or replaced.
Return a new DataFrame with the columns added (if new), or replaced.
Notes
-----
Creating a new DataFrame using this method does not create a new copy of
existing data.
Creating a new DataFrame using this method does not create a new copy
of existing data.
Parameters
----------
exprs
List of Expressions that evaluate to columns.
List of expressions that evaluate to columns.
**named_exprs
Named column Expressions, provided as kwargs.
Named column expressions, provided as kwargs.
Examples
--------
Expand All @@ -5620,7 +5640,7 @@ def with_columns(
... }
... )
Passing in a single expression, adding the column as we give it a new name:
Passing in a single expression, adding (and naming) a new column:
>>> df.with_columns((pl.col("a") ** 2).alias("a^2"))
shape: (4, 4)
Expand All @@ -5635,8 +5655,8 @@ def with_columns(
│ 4 ┆ 13.0 ┆ true ┆ 16.0 │
└─────┴──────┴───────┴──────┘
We can also override a column, by giving the expression a name that already
exists:
We can also override an existing column by giving the expression
a name that already exists:
>>> df.with_columns((pl.col("a") ** 2).alias("c"))
shape: (4, 3)
Expand All @@ -5651,7 +5671,7 @@ def with_columns(
│ 4 ┆ 13.0 ┆ 16.0 │
└─────┴──────┴──────┘
Passing in multiple expressions as a list:
Multiple expressions can be passed in either as a list...
>>> df.with_columns(
... [
Expand All @@ -5672,17 +5692,15 @@ def with_columns(
│ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
└─────┴──────┴───────┴──────┴──────┴───────┘
Support for kwarg expressions is considered EXPERIMENTAL. Currently
requires opt-in via `pl.Config` boolean flag:
...or via kwarg expressions:
>>> pl.Config.with_columns_kwargs = True
>>> df.with_columns(
... d=pl.col("a") * pl.col("b"),
... e=pl.col("c").is_not(),
... ab=pl.col("a") * pl.col("b"),
... not_c=pl.col("c").is_not(),
... )
shape: (4, 5)
┌─────┬──────┬───────┬──────┬───────┐
│ a ┆ b ┆ c ┆ d ┆ e
│ a ┆ b ┆ c ┆ ab ┆ not_c
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │
╞═════╪══════╪═══════╪══════╪═══════╡
Expand All @@ -5692,6 +5710,24 @@ def with_columns(
│ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │
└─────┴──────┴───────┴──────┴───────┘
Note that, when using kwargs syntax, expressions with multiple
outputs are automatically instantiated as Struct columns:
>>> df.drop("c").with_columns(
... diffs=pl.col(["a", "b"]).diff().suffix("_diff"),
... )
shape: (4, 3)
┌─────┬──────┬─────────────┐
│ a ┆ b ┆ diffs │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ struct[2] │
╞═════╪══════╪═════════════╡
│ 1 ┆ 0.5 ┆ {null,null} │
│ 2 ┆ 4.0 ┆ {1,3.5} │
│ 3 ┆ 10.0 ┆ {1,6.0} │
│ 4 ┆ 13.0 ┆ {1,3.0} │
└─────┴──────┴─────────────┘
"""
return (
self.lazy().with_columns(exprs, **named_exprs).collect(no_optimization=True)
Expand Down
30 changes: 20 additions & 10 deletions py-polars/polars/internals/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def expr_to_lit_or_expr(
| Sequence[int | float | str | None]
),
str_to_lit: bool = True,
structify: bool = False,
) -> Expr:
"""
Convert args to expressions.
Expand All @@ -99,34 +100,43 @@ def expr_to_lit_or_expr(
expr
Any argument.
str_to_lit
If True string argument `"foo"` will be converted to `lit("foo")`.
If False it will be converted to `col("foo")`.
If True, a string argument `"foo"` will be converted to `lit("foo")`;
if False, it will be converted to `col("foo")`.
structify
If the final unaliased expression has multiple output names,
automatically convert it to a struct.
Returns
-------
Expr
"""
if isinstance(expr, str) and not str_to_lit:
return pli.col(expr)
expr = pli.col(expr)
elif (
isinstance(expr, (int, float, str, pli.Series, datetime, date, time, timedelta))
or expr is None
):
return pli.lit(expr)
elif isinstance(expr, Expr):
return expr
expr = pli.lit(expr)
structify = False
elif isinstance(expr, list):
return pli.lit(pli.Series("", [expr]))
expr = pli.lit(pli.Series("", [expr]))
structify = False
elif isinstance(expr, (pli.WhenThen, pli.WhenThenThen)):
# implicitly add the null branch.
return expr.otherwise(None)
else:
expr = expr.otherwise(None) # implicitly add the null branch.
elif not isinstance(expr, Expr):
raise ValueError(
f"did not expect value {expr} of type {type(expr)}, maybe disambiguate with"
" pl.lit or pl.col"
)

if structify:
unaliased_expr = expr.meta.undo_aliases()
if unaliased_expr.meta.has_multiple_outputs():
expr = cast(Expr, pli.struct(expr))

return expr


def wrap_expr(pyexpr: PyExpr) -> Expr:
    """
    Wrap a ``PyExpr`` object in an ``Expr``.

    Parameters
    ----------
    pyexpr
        The ``PyExpr`` instance to wrap.

    Returns
    -------
    Expr
        The result of ``Expr._from_pyexpr(pyexpr)``.

    """
    wrapped = Expr._from_pyexpr(pyexpr)
    return wrapped
Expand Down
5 changes: 4 additions & 1 deletion py-polars/polars/internals/lazy_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2247,6 +2247,7 @@ def collect_all(

def select(
exprs: str | pli.Expr | Sequence[str | pli.Expr] | pli.Series,
**named_exprs: Any,
) -> pli.DataFrame:
"""
Run polars expressions without a context.
Expand All @@ -2257,6 +2258,8 @@ def select(
----------
exprs
Expressions to run
**named_exprs
Named expressions, provided as kwargs.
Returns
-------
Expand All @@ -2283,7 +2286,7 @@ def select(
└─────┘
"""
return pli.DataFrame([]).select(exprs)
return pli.DataFrame([]).select(exprs, **named_exprs)


@overload
Expand Down
Loading

0 comments on commit fa3f48c

Please sign in to comment.