Skip to content

Commit

Permalink
Make DataFrame.any_rowwise top-level, rename to _horizontal (#324)
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcoGorelli authored Nov 22, 2023
1 parent 2623018 commit 27d5fc4
Show file tree
Hide file tree
Showing 5 changed files with 176 additions and 93 deletions.
118 changes: 117 additions & 1 deletion spec/API_specification/dataframe_api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""Function stubs and API documentation for the DataFrame API standard."""
from __future__ import annotations

from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, Literal

from .column_object import Column
from .dataframe_object import DataFrame
Expand Down Expand Up @@ -300,3 +300,119 @@ def date(year: int, month: int, day: int) -> Scalar:
... )
>>> df.filter(mask)
"""


def any_horizontal(*columns: Column, skip_nulls: bool = True) -> Column:
    """Reduction returns a Column.

    Differs from :meth:`DataFrame.any` in that the reduction happens
    for each row, rather than for each column.

    All the `columns` must have the same parent DataFrame.
    The return value has the same parent DataFrame as the input columns.

    Parameters
    ----------
    *columns : Column
        Boolean columns to reduce across, row by row.
    skip_nulls : bool
        Whether to skip null values in the reduction. Defaults to ``True``.

    Returns
    -------
    Column
        Result of the row-wise reduction.

    Raises
    ------
    ValueError
        If any of the columns is not boolean.

    Examples
    --------
    >>> df: DataFrame
    >>> ns = df.__dataframe_namespace__()
    >>> mask = ns.any_horizontal(
    ...     *[df.col(col_name) > 0 for col_name in df.column_names]
    ... )
    >>> df = df.filter(mask)
    """
    ...


def all_horizontal(*columns: Column, skip_nulls: bool = True) -> Column:
    """Reduction returns a Column.

    Differs from :meth:`DataFrame.all` in that the reduction happens
    for each row, rather than for each column.

    All the `columns` must have the same parent DataFrame.
    The return value has the same parent DataFrame as the input columns.

    Parameters
    ----------
    *columns : Column
        Boolean columns to reduce across, row by row.
    skip_nulls : bool
        Whether to skip null values in the reduction. Defaults to ``True``.

    Returns
    -------
    Column
        Result of the row-wise reduction.

    Raises
    ------
    ValueError
        If any of the columns is not boolean.

    Examples
    --------
    >>> df: DataFrame
    >>> ns = df.__dataframe_namespace__()
    >>> mask = ns.all_horizontal(
    ...     *[df.col(col_name) > 0 for col_name in df.column_names]
    ... )
    >>> df = df.filter(mask)
    """
    ...


def sorted_indices(
    *columns: Column,
    ascending: Sequence[bool] | bool = True,
    nulls_position: Literal["first", "last"] = "last",
) -> Column:
    """Return row numbers which would sort according to given columns.

    If you need to sort the DataFrame, use :meth:`DataFrame.sort`.

    Parameters
    ----------
    *columns : Column
        Columns to sort by.
    ascending : Sequence[bool] or bool
        If `True`, sort by all columns in ascending order.
        If `False`, sort by all columns in descending order.
        If a sequence, it must be the same length as `columns`,
        and determines the direction with which to use each
        column to sort by.
    nulls_position : ``{'first', 'last'}``
        Whether null values should be placed at the beginning
        or at the end of the result.
        Note that the position of NaNs is unspecified and may
        vary based on the implementation.

    Returns
    -------
    Column
        The return value has the same parent DataFrame as the input columns.

    Raises
    ------
    ValueError
        If `ascending` is a sequence whose length differs from the
        number of `columns`.
    """
    ...


def unique_indices(*columns: Column, skip_nulls: bool = True) -> Column:
    """Return indices corresponding to unique values across selected columns.

    Parameters
    ----------
    *columns : Column
        Columns to consider when finding unique values.
    skip_nulls : bool
        If ``False``, multiple null values are reduced to a single index,
        like any other repeated value (see Notes). Defaults to ``True``.

    Returns
    -------
    Column
        Indices corresponding to unique values.

    Notes
    -----
    There are no ordering guarantees. In particular, if there are multiple
    indices corresponding to the same unique value(s), there is no guarantee
    about which one will appear in the result.
    If the original column(s) contain multiple `'NaN'` values, then
    only a single index corresponding to those values will be returned.
    Likewise for null values (if ``skip_nulls=False``).
    To get the unique values, you can do
    ``df.get_rows(ns.unique_indices(*columns))``, where ``ns`` is the
    namespace returned by ``df.__dataframe_namespace__()``.
    """
    ...
2 changes: 1 addition & 1 deletion spec/API_specification/dataframe_api/column_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def sort(
"""Sort column.
If you need the indices which would sort the column,
use :meth:`sorted_indices`.
use `sorted_indices`.
Parameters
----------
Expand Down
92 changes: 1 addition & 91 deletions spec/API_specification/dataframe_api/dataframe_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def sort(
"""Sort dataframe according to given columns.
If you only need the indices which would sort the dataframe, use
:meth:`sorted_indices`.
`sorted_indices`.
Parameters
----------
Expand Down Expand Up @@ -314,44 +314,6 @@ def sort(
"""
...

def sorted_indices(
    self,
    *keys: str,
    ascending: Sequence[bool] | bool = True,
    nulls_position: Literal["first", "last"] = "last",
) -> Column:
    """Return row numbers which would sort according to given columns.

    If you need to sort the DataFrame, use :meth:`sort`.

    Parameters
    ----------
    *keys : str
        Names of columns to sort by.
        If not specified, sort by all columns.
    ascending : Sequence[bool] or bool
        If `True`, sort by all keys in ascending order.
        If `False`, sort by all keys in descending order.
        If a sequence, it must be the same length as `keys`,
        and determines the direction with which to use each
        key to sort by.
    nulls_position : ``{'first', 'last'}``
        Whether null values should be placed at the beginning
        or at the end of the result.
        Note that the position of NaNs is unspecified and may
        vary based on the implementation.

    Returns
    -------
    Column
        Row numbers which would sort the DataFrame by the given keys.

    Raises
    ------
    ValueError
        If `keys` and `ascending` are sequences of different lengths.
    """
    ...

def __eq__(self, other: AnyScalar) -> Self: # type: ignore[override]
"""Compare for equality.
Expand Down Expand Up @@ -678,32 +640,6 @@ def all(self, *, skip_nulls: bool | Scalar = True) -> Self:
"""
...

def any_rowwise(self, *, skip_nulls: bool | Scalar = True) -> Column:
    """Reduction returns a Column.

    Differs from ``DataFrame.any`` in that the reduction happens
    for each row, rather than for each column.

    Raises
    ------
    ValueError
        If any of the DataFrame's columns is not boolean.
    """
    ...

def all_rowwise(self, *, skip_nulls: bool | Scalar = True) -> Column:
    """Reduction returns a Column.

    Differs from ``DataFrame.all`` in that the reduction happens
    for each row, rather than for each column.

    Raises
    ------
    ValueError
        If any of the DataFrame's columns is not boolean.
    """
    ...

def min(self, *, skip_nulls: bool | Scalar = True) -> Self:
    """Reduction returns a 1-row DataFrame."""
    # NOTE(review): the effect of ``skip_nulls`` is not described in this
    # view; presumably nulls are ignored when it is true - confirm in the spec.
    ...
Expand Down Expand Up @@ -804,32 +740,6 @@ def is_nan(self) -> Self:
"""
...

def unique_indices(self, *keys: str, skip_nulls: bool | Scalar = True) -> Column:
    """Return indices corresponding to unique values across selected columns.

    Parameters
    ----------
    *keys : str
        Column names to consider when finding unique values.
        If not specified, all columns are considered.
    skip_nulls : bool or Scalar
        If ``False``, multiple null values are reduced to a single index,
        like any other repeated value (see Notes). Defaults to ``True``.

    Returns
    -------
    Column
        Indices corresponding to unique values.

    Notes
    -----
    There are no ordering guarantees. In particular, if there are multiple
    indices corresponding to the same unique value(s), there is no guarantee
    about which one will appear in the result.
    If the original column(s) contain multiple `'NaN'` values, then
    only a single index corresponding to those values will be returned.
    Likewise for null values (if ``skip_nulls=False``).
    To get the unique values, you can do ``df.get_rows(df.unique_indices(*keys))``.
    """
    ...

def fill_nan(self, value: float | NullType | Scalar, /) -> Self:
"""Fill ``nan`` values with the given fill value.
Expand Down
29 changes: 29 additions & 0 deletions spec/API_specification/dataframe_api/typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,35 @@ def is_dtype(self, dtype: DType, kind: str | tuple[str, ...]) -> bool:
def date(self, year: int, month: int, day: int) -> Scalar:
    """Typing stub mirroring the top-level ``date`` function of the namespace."""
    ...

def any_horizontal(
    self,
    *columns: Column,
    skip_nulls: bool = True,
) -> Column:
    """Typing stub mirroring the top-level ``any_horizontal`` function."""
    ...

def all_horizontal(
    self,
    *columns: Column,
    skip_nulls: bool = True,
) -> Column:
    """Typing stub mirroring the top-level ``all_horizontal`` function."""
    ...

def sorted_indices(
    self,
    *columns: Column,
    ascending: Sequence[bool] | bool = True,
    nulls_position: Literal["first", "last"] = "last",
) -> Column:
    """Typing stub mirroring the top-level ``sorted_indices`` function."""
    ...

def unique_indices(
    self,
    *columns: Column,
    skip_nulls: bool = True,
) -> Column:
    """Typing stub mirroring the top-level ``unique_indices`` function."""
    ...


DType = Union[
Namespace.Bool,
Expand Down
28 changes: 28 additions & 0 deletions spec/API_specification/examples/06_horizontal_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Example of how to use a horizontal function.
Horizontal functions are functions that take multiple columns as input and return a
single column as output.
Examples include:
- `any_horizontal`
- `all_horizontal`
These can be accessed by first using ``__dataframe_namespace__`` to get the
namespace object, and then calling the function on the namespace object and passing
the ``Column``s as positional arguments (unpack an iterable with ``*``).
"""
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
from dataframe_api.typing import SupportsDataFrameAPI


def main(df_raw: SupportsDataFrameAPI) -> SupportsDataFrameAPI:
    """Keep only the rows in which at least one column is greater than zero.

    The input is converted to a standard-compliant DataFrame, one
    ``column > 0`` mask is built per column, the masks are combined row by
    row with ``any_horizontal``, and the filtered result is returned via
    its ``dataframe`` attribute.
    """
    df = df_raw.__dataframe_consortium_standard__(api_version="2023-11.beta")
    ns = df.__dataframe_namespace__()
    # One boolean mask per column, then reduce them across each row.
    positive_masks = [df.col(name) > 0 for name in df.column_names]
    any_positive = ns.any_horizontal(*positive_masks)
    filtered = df.filter(any_positive)
    return filtered.dataframe

0 comments on commit 27d5fc4

Please sign in to comment.