Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python): More ergonomic sort args #6896

Merged
merged 1 commit into from
Feb 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 78 additions & 59 deletions py-polars/polars/internals/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,6 @@
ParallelStrategy,
ParquetCompression,
PivotAgg,
PolarsExprType,
PythonLiteral,
RollingInterpolationMethod,
SizeUnit,
StartBy,
Expand Down Expand Up @@ -3060,71 +3058,99 @@ def replace_at_idx(self, index: int, series: pli.Series) -> Self:

def sort(
self,
by: str | pli.Expr | Sequence[str] | Sequence[pli.Expr],
*,
reverse: bool | list[bool] = False,
by: IntoExpr | Iterable[IntoExpr],
*more_by: IntoExpr,
reverse: bool | Sequence[bool] = False,
nulls_last: bool = False,
) -> Self:
"""
Sort the DataFrame by column.
Sort the dataframe by the given columns.

Parameters
----------
by
By which column to sort. Only accepts string.
Column(s) to sort by. Accepts expression input. Strings are parsed as column
names.
*more_by
Additional columns to sort by, specified as positional arguments.
reverse
Reverse/descending sort.
Sort in descending order. When sorting by multiple columns, can be specified
per column by passing a sequence of booleans.
nulls_last
Place null values last. Can only be used if sorted by a single column.
Place null values last. Can only be used when sorting by a single column.

Examples
--------
Pass a single column name to sort by that column.

>>> df = pl.DataFrame(
... {
... "foo": [1, 2, 3],
... "bar": [6.0, 7.0, 8.0],
... "ham": ["a", "b", "c"],
... "a": [1, 2, None],
... "b": [6.0, 5.0, 4.0],
... "c": ["a", "c", "b"],
... }
... )
>>> df.sort("foo", reverse=True)
>>> df.sort("a")
shape: (3, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ str │
╞═════╪═════╪═════╡
│ 3 ┆ 8.0 ┆ c │
│ 2 ┆ 7.0 ┆ b │
│ 1 ┆ 6.0 ┆ a │
└─────┴─────┴─────┘

**Sort by multiple columns.**
For multiple columns we can also use expression syntax.

>>> df.sort(
... [pl.col("foo"), pl.col("bar") ** 2],
... reverse=[True, False],
... )
┌──────┬─────┬─────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ str │
╞══════╪═════╪═════╡
│ null ┆ 4.0 ┆ b │
│ 1 ┆ 6.0 ┆ a │
│ 2 ┆ 5.0 ┆ c │
└──────┴─────┴─────┘

Sorting by expressions is also supported.

>>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True)
shape: (3, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ str │
╞═════╪═════╪═════╡
│ 3 ┆ 8.0 ┆ c │
│ 2 ┆ 7.0 ┆ b │
│ 1 ┆ 6.0 ┆ a │
└─────┴─────┴─────┘
┌──────┬─────┬─────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ str │
╞══════╪═════╪═════╡
│ 2 ┆ 5.0 ┆ c │
│ 1 ┆ 6.0 ┆ a │
│ null ┆ 4.0 ┆ b │
└──────┴─────┴─────┘

Sort by multiple columns by passing a list of columns.

>>> df.sort(["c", "a"], reverse=True)
shape: (3, 3)
┌──────┬─────┬─────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ str │
╞══════╪═════╪═════╡
│ 2 ┆ 5.0 ┆ c │
│ null ┆ 4.0 ┆ b │
│ 1 ┆ 6.0 ┆ a │
└──────┴─────┴─────┘

Or use positional arguments to sort by multiple columns in the same way.

>>> df.sort("c", "a", reverse=[False, True])
shape: (3, 3)
┌──────┬─────┬─────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ str │
╞══════╪═════╪═════╡
│ 1 ┆ 6.0 ┆ a │
│ null ┆ 4.0 ┆ b │
│ 2 ┆ 5.0 ┆ c │
└──────┴─────┴─────┘

"""
if not isinstance(by, str) and isinstance(by, (Sequence, pli.Expr)):
df = (
self.lazy()
.sort(by, reverse=reverse, nulls_last=nulls_last)
.collect(no_optimization=True)
)
return self._from_pydf(df._df)
return self._from_pydf(self._df.sort(by, reverse, nulls_last))
return self._from_pydf(
self.lazy()
.sort(by, *more_by, reverse=reverse, nulls_last=nulls_last)
.collect(no_optimization=True)
._df
)

def frame_equal(self, other: DataFrame, null_equal: bool = True) -> bool:
"""
Expand Down Expand Up @@ -5731,25 +5757,18 @@ def lazy(self) -> pli.LazyFrame:

def select(
self,
exprs: (
str
| PolarsExprType
| PythonLiteral
| pli.Series
| Iterable[str | PolarsExprType | PythonLiteral | pli.Series | None]
| None
) = None,
*more_exprs: str | PolarsExprType | PythonLiteral | pli.Series | None,
**named_exprs: str | PolarsExprType | PythonLiteral | pli.Series | None,
exprs: IntoExpr | Iterable[IntoExpr] | None = None,
*more_exprs: IntoExpr,
**named_exprs: IntoExpr,
) -> Self:
"""
Select columns from this DataFrame.

Parameters
----------
exprs
Column or columns to select. Accepts expression input. Strings are parsed
as column names, other non-expression inputs are parsed as literals.
Column(s) to select. Accepts expression input. Strings are parsed as column
names, other non-expression inputs are parsed as literals.
*more_exprs
Additional columns to select, specified as positional arguments.
**named_exprs
Expand Down
Loading