diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 5db4caea..9ae7de4a 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -5,9 +5,14 @@ if TYPE_CHECKING: from typing_extensions import Self - from dataframe_api.dataframe_object import DataFrame - - from .typing import DType, Namespace, NullType, Scalar + from .typing import ( + AnyScalar, + DataFrame, + DType, + Namespace, + NullType, + Scalar, + ) __all__ = ["Column"] @@ -224,7 +229,7 @@ def sorted_indices( """ ... - def __eq__(self, other: Self | Scalar) -> Self: # type: ignore[override] + def __eq__(self, other: Self | AnyScalar) -> Self: # type: ignore[override] """Compare for equality. Nulls should follow Kleene Logic. @@ -247,7 +252,7 @@ def __eq__(self, other: Self | Scalar) -> Self: # type: ignore[override] """ ... - def __ne__(self, other: Self | Scalar) -> Self: # type: ignore[override] + def __ne__(self, other: Self | AnyScalar) -> Self: # type: ignore[override] """Compare for non-equality. Nulls should follow Kleene Logic. @@ -270,7 +275,7 @@ def __ne__(self, other: Self | Scalar) -> Self: # type: ignore[override] """ ... - def __ge__(self, other: Self | Scalar) -> Self: + def __ge__(self, other: Self | AnyScalar) -> Self: """Compare for "greater than or equal to" `other`. Parameters @@ -291,7 +296,7 @@ def __ge__(self, other: Self | Scalar) -> Self: """ ... - def __gt__(self, other: Self | Scalar) -> Self: + def __gt__(self, other: Self | AnyScalar) -> Self: """Compare for "greater than" `other`. Parameters @@ -312,7 +317,7 @@ def __gt__(self, other: Self | Scalar) -> Self: """ ... - def __le__(self, other: Self | Scalar) -> Self: + def __le__(self, other: Self | AnyScalar) -> Self: """Compare for "less than or equal to" `other`. Parameters @@ -333,7 +338,7 @@ def __le__(self, other: Self | Scalar) -> Self: """ ... 
- def __lt__(self, other: Self | Scalar) -> Self: + def __lt__(self, other: Self | AnyScalar) -> Self: """Compare for "less than" `other`. Parameters @@ -354,7 +359,7 @@ def __lt__(self, other: Self | Scalar) -> Self: """ ... - def __and__(self, other: Self | bool) -> Self: + def __and__(self, other: Self | bool | Scalar) -> Self: """Apply logical 'and' to `other` Column (or scalar) and this Column. Nulls should follow Kleene Logic. @@ -380,7 +385,7 @@ def __and__(self, other: Self | bool) -> Self: """ ... - def __or__(self, other: Self | bool) -> Self: + def __or__(self, other: Self | bool | Scalar) -> Self: """Apply logical 'or' to `other` Column (or scalar) and this column. Nulls should follow Kleene Logic. @@ -406,7 +411,7 @@ def __or__(self, other: Self | bool) -> Self: """ ... - def __add__(self, other: Self | Scalar) -> Self: + def __add__(self, other: Self | AnyScalar) -> Self: """Add `other` column or scalar to this column. Parameters @@ -427,7 +432,7 @@ def __add__(self, other: Self | Scalar) -> Self: """ ... - def __sub__(self, other: Self | Scalar) -> Self: + def __sub__(self, other: Self | AnyScalar) -> Self: """Subtract `other` column or scalar from this column. Parameters @@ -448,7 +453,7 @@ def __sub__(self, other: Self | Scalar) -> Self: """ ... - def __mul__(self, other: Self | Scalar) -> Self: + def __mul__(self, other: Self | AnyScalar) -> Self: """Multiply `other` column or scalar with this column. Parameters @@ -469,7 +474,7 @@ def __mul__(self, other: Self | Scalar) -> Self: """ ... - def __truediv__(self, other: Self | Scalar) -> Self: + def __truediv__(self, other: Self | AnyScalar) -> Self: """Divide this column by `other` column or scalar. True division, returns floats. Parameters @@ -490,7 +495,7 @@ def __truediv__(self, other: Self | Scalar) -> Self: """ ... - def __floordiv__(self, other: Self | Scalar) -> Self: + def __floordiv__(self, other: Self | AnyScalar) -> Self: """Floor-divide `other` column or scalar to this column. 
Parameters @@ -511,7 +516,7 @@ def __floordiv__(self, other: Self | Scalar) -> Self: """ ... - def __pow__(self, other: Self | Scalar) -> Self: + def __pow__(self, other: Self | AnyScalar) -> Self: """Raise this column to the power of `other`. Integer dtype to the power of non-negative integer dtype is integer dtype. @@ -536,7 +541,7 @@ def __pow__(self, other: Self | Scalar) -> Self: """ ... - def __mod__(self, other: Self | Scalar) -> Self: + def __mod__(self, other: Self | AnyScalar) -> Self: """Return modulus of this column by `other` (`%` operator). Parameters @@ -557,7 +562,7 @@ def __mod__(self, other: Self | Scalar) -> Self: """ ... - def __divmod__(self, other: Self | Scalar) -> tuple[Column, Column]: + def __divmod__(self, other: Self | AnyScalar) -> tuple[Column, Column]: """Return quotient and remainder of integer division. See `divmod` builtin. Parameters @@ -578,16 +583,16 @@ def __divmod__(self, other: Self | Scalar) -> tuple[Column, Column]: """ ... - def __radd__(self, other: Self | Scalar) -> Self: + def __radd__(self, other: Self | AnyScalar) -> Self: ... - def __rsub__(self, other: Self | Scalar) -> Self: + def __rsub__(self, other: Self | AnyScalar) -> Self: ... - def __rmul__(self, other: Self | Scalar) -> Self: + def __rmul__(self, other: Self | AnyScalar) -> Self: ... - def __rtruediv__(self, other: Self | Scalar) -> Self: + def __rtruediv__(self, other: Self | AnyScalar) -> Self: ... def __rand__(self, other: Self | bool) -> Self: @@ -596,13 +601,13 @@ def __rand__(self, other: Self | bool) -> Self: def __ror__(self, other: Self | bool) -> Self: ... - def __rfloordiv__(self, other: Self | Scalar) -> Self: + def __rfloordiv__(self, other: Self | AnyScalar) -> Self: ... - def __rpow__(self, other: Self | Scalar) -> Self: + def __rpow__(self, other: Self | AnyScalar) -> Self: ... - def __rmod__(self, other: Self | Scalar) -> Self: + def __rmod__(self, other: Self | AnyScalar) -> Self: ... 
def __invert__(self) -> Self: @@ -615,7 +620,7 @@ def __invert__(self) -> Self: """ ... - def any(self, *, skip_nulls: bool = True) -> bool | NullType: + def any(self, *, skip_nulls: bool | Scalar = True) -> Scalar: """Reduction returns a bool. Raises @@ -625,7 +630,7 @@ def any(self, *, skip_nulls: bool = True) -> bool | NullType: """ ... - def all(self, *, skip_nulls: bool = True) -> bool | NullType: + def all(self, *, skip_nulls: bool | Scalar = True) -> Scalar: """Reduction returns a bool. Raises @@ -635,7 +640,7 @@ def all(self, *, skip_nulls: bool = True) -> bool | NullType: """ ... - def min(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def min(self, *, skip_nulls: bool | Scalar = True) -> Scalar: """Reduction returns a scalar. Any data type that supports comparisons @@ -643,7 +648,7 @@ def min(self, *, skip_nulls: bool = True) -> Scalar | NullType: """ ... - def max(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def max(self, *, skip_nulls: bool | Scalar = True) -> Scalar: """Reduction returns a scalar. Any data type that supports comparisons @@ -651,7 +656,7 @@ def max(self, *, skip_nulls: bool = True) -> Scalar | NullType: """ ... - def sum(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def sum(self, *, skip_nulls: bool | Scalar = True) -> Scalar: """Reduction returns a scalar. Must be supported for numerical and @@ -660,7 +665,7 @@ def sum(self, *, skip_nulls: bool = True) -> Scalar | NullType: """ ... - def prod(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def prod(self, *, skip_nulls: bool | Scalar = True) -> Scalar: """Reduction returns a scalar. Must be supported for numerical data types. @@ -668,7 +673,7 @@ def prod(self, *, skip_nulls: bool = True) -> Scalar | NullType: """ ... - def median(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def median(self, *, skip_nulls: bool | Scalar = True) -> Scalar: """Reduction returns a scalar. 
Must be supported for numerical and @@ -678,7 +683,7 @@ def median(self, *, skip_nulls: bool = True) -> Scalar | NullType: """ ... - def mean(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def mean(self, *, skip_nulls: bool | Scalar = True) -> Scalar: """Reduction returns a scalar. Must be supported for numerical and @@ -691,9 +696,9 @@ def mean(self, *, skip_nulls: bool = True) -> Scalar | NullType: def std( self, *, - correction: int | float = 1, - skip_nulls: bool = True, - ) -> Scalar | NullType: + correction: float = 1, + skip_nulls: bool | Scalar = True, + ) -> Scalar: """Reduction returns a scalar. Must be supported for numerical and @@ -724,9 +729,9 @@ def std( def var( self, *, - correction: int | float = 1, - skip_nulls: bool = True, - ) -> Scalar | NullType: + correction: float | Scalar = 1, + skip_nulls: bool | Scalar = True, + ) -> Scalar: """Reduction returns a scalar. Must be supported for numerical and @@ -835,7 +840,7 @@ def is_in(self, values: Self) -> Self: """ ... - def unique_indices(self, *, skip_nulls: bool = True) -> Self: + def unique_indices(self, *, skip_nulls: bool | Scalar = True) -> Self: """Return indices corresponding to unique values in Column. Returns @@ -855,7 +860,7 @@ def unique_indices(self, *, skip_nulls: bool = True) -> Self: """ ... - def fill_nan(self, value: float | NullType, /) -> Self: + def fill_nan(self, value: float | NullType | Scalar, /) -> Self: """Fill floating point ``nan`` values with the given fill value. Parameters @@ -868,7 +873,7 @@ def fill_nan(self, value: float | NullType, /) -> Self: """ ... - def fill_null(self, value: Scalar, /) -> Self: + def fill_null(self, value: AnyScalar, /) -> Self: """Fill null values with the given fill value. Parameters @@ -914,7 +919,7 @@ def to_array(self) -> Any: """ ... - def rename(self, name: str) -> Self: + def rename(self, name: str | Scalar) -> Self: """Rename column. Parameters @@ -929,17 +934,17 @@ def rename(self, name: str) -> Self: """ ... 
- def shift(self, offset: int) -> Self: + def shift(self, offset: int | Scalar) -> Self: """Shift values by `offset` positions, filling missing values with `null`. For example, if the original column contains values `[1, 4, 2]`, then: - `.shift(1)` will return `[null, 1, 4]`, - `.shift(-1)` will return `[4, 2, null]`, - + Parameters ---------- - offset + offset : int How many positions to shift by. """ ... @@ -1020,7 +1025,7 @@ def iso_weekday(self) -> Self: """ ... - def unix_timestamp(self, *, time_unit: Literal["s", "ms", "us"] = "s") -> Self: + def unix_timestamp(self, *, time_unit: str | Scalar = "s") -> Self: """Return number of seconds / milliseconds / microseconds since the Unix epoch. The Unix epoch is 00:00:00 UTC on 1 January 1970. @@ -1039,3 +1044,16 @@ def unix_timestamp(self, *, time_unit: Literal["s", "ms", "us"] = "s") -> Self: discarded. """ ... + + def persist(self) -> Self: + """Hint that computation prior to this point should not be repeated. + + This is intended as a hint, rather than as a directive. Implementations + which do not separate lazy vs eager execution may ignore this method and + treat it as a no-op. + + .. note:: + This method may trigger execution. If necessary, it should be called + at most once per dataframe, and as late as possible in the pipeline. + """ + ... diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 462fc911..4bd431ea 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -9,7 +9,14 @@ from .column_object import Column from .groupby_object import GroupBy - from .typing import DType, Namespace, NullType, Scalar, SupportsDataFrameAPI + from .typing import ( + AnyScalar, + DType, + Namespace, + NullType, + Scalar, + SupportsDataFrameAPI, + ) __all__ = ["DataFrame"] @@ -345,7 +352,7 @@ def sorted_indices( """ ... 
- def __eq__(self, other: Scalar) -> Self: # type: ignore[override] + def __eq__(self, other: AnyScalar) -> Self: # type: ignore[override] """Compare for equality. Nulls should follow Kleene Logic. @@ -362,7 +369,7 @@ def __eq__(self, other: Scalar) -> Self: # type: ignore[override] """ ... - def __ne__(self, other: Scalar) -> Self: # type: ignore[override] + def __ne__(self, other: AnyScalar) -> Self: # type: ignore[override] """Compare for non-equality. Nulls should follow Kleene Logic. @@ -379,7 +386,7 @@ def __ne__(self, other: Scalar) -> Self: # type: ignore[override] """ ... - def __ge__(self, other: Scalar) -> Self: + def __ge__(self, other: AnyScalar) -> Self: """Compare for "greater than or equal to" `other`. Parameters @@ -394,7 +401,7 @@ def __ge__(self, other: Scalar) -> Self: """ ... - def __gt__(self, other: Scalar) -> Self: + def __gt__(self, other: AnyScalar) -> Self: """Compare for "greater than" `other`. Parameters @@ -409,7 +416,7 @@ def __gt__(self, other: Scalar) -> Self: """ ... - def __le__(self, other: Scalar) -> Self: + def __le__(self, other: AnyScalar) -> Self: """Compare for "less than or equal to" `other`. Parameters @@ -424,7 +431,7 @@ def __le__(self, other: Scalar) -> Self: """ ... - def __lt__(self, other: Scalar) -> Self: + def __lt__(self, other: AnyScalar) -> Self: """Compare for "less than" `other`. Parameters @@ -479,7 +486,7 @@ def __or__(self, other: bool) -> Self: # noqa: FBT001 """ ... - def __add__(self, other: Scalar) -> Self: + def __add__(self, other: AnyScalar) -> Self: """Add `other` scalar to this dataframe. Parameters @@ -494,7 +501,7 @@ def __add__(self, other: Scalar) -> Self: """ ... - def __sub__(self, other: Scalar) -> Self: + def __sub__(self, other: AnyScalar) -> Self: """Subtract `other` scalar from this dataframe. Parameters @@ -509,7 +516,7 @@ def __sub__(self, other: Scalar) -> Self: """ ... 
- def __mul__(self, other: Scalar) -> Self: + def __mul__(self, other: AnyScalar) -> Self: """Multiply `other` scalar with this dataframe. Parameters @@ -524,7 +531,7 @@ def __mul__(self, other: Scalar) -> Self: """ ... - def __truediv__(self, other: Scalar) -> Self: + def __truediv__(self, other: AnyScalar) -> Self: """Divide this dataframe by `other` scalar. True division, returns floats. Parameters @@ -539,7 +546,7 @@ def __truediv__(self, other: Scalar) -> Self: """ ... - def __floordiv__(self, other: Scalar) -> Self: + def __floordiv__(self, other: AnyScalar) -> Self: """Floor-divide (returns integers) this dataframe by `other` scalar. Parameters @@ -554,7 +561,7 @@ def __floordiv__(self, other: Scalar) -> Self: """ ... - def __pow__(self, other: Scalar) -> Self: + def __pow__(self, other: AnyScalar) -> Self: """Raise this dataframe to the power of `other`. Integer dtype to the power of non-negative integer dtype is integer dtype. @@ -573,7 +580,7 @@ def __pow__(self, other: Scalar) -> Self: """ ... - def __mod__(self, other: Scalar) -> Self: + def __mod__(self, other: AnyScalar) -> Self: """Return modulus of this dataframe by `other` (`%` operator). Parameters @@ -588,7 +595,7 @@ def __mod__(self, other: Scalar) -> Self: """ ... - def __divmod__(self, other: Scalar) -> tuple[DataFrame, DataFrame]: + def __divmod__(self, other: AnyScalar) -> tuple[DataFrame, DataFrame]: """Return quotient and remainder of integer division. See `divmod` builtin. Parameters @@ -603,31 +610,31 @@ def __divmod__(self, other: Scalar) -> tuple[DataFrame, DataFrame]: """ ... - def __radd__(self, other: Scalar) -> Self: + def __radd__(self, other: AnyScalar) -> Self: ... - def __rsub__(self, other: Scalar) -> Self: + def __rsub__(self, other: AnyScalar) -> Self: ... - def __rmul__(self, other: Scalar) -> Self: + def __rmul__(self, other: AnyScalar) -> Self: ... - def __rtruediv__(self, other: Scalar) -> Self: + def __rtruediv__(self, other: AnyScalar) -> Self: ... 
- def __rand__(self, other: Scalar) -> Self: + def __rand__(self, other: AnyScalar) -> Self: ... - def __ror__(self, other: Scalar) -> Self: + def __ror__(self, other: AnyScalar) -> Self: ... - def __rfloordiv__(self, other: Scalar) -> Self: + def __rfloordiv__(self, other: AnyScalar) -> Self: ... - def __rpow__(self, other: Scalar) -> Self: + def __rpow__(self, other: AnyScalar) -> Self: ... - def __rmod__(self, other: Scalar) -> Self: + def __rmod__(self, other: AnyScalar) -> Self: ... def __invert__(self) -> Self: @@ -651,7 +658,7 @@ def __iter__(self) -> NoReturn: """ raise NotImplementedError("'__iter__' is intentionally not implemented.") - def any(self, *, skip_nulls: bool = True) -> Self: + def any(self, *, skip_nulls: bool | Scalar = True) -> Self: """Reduction returns a 1-row DataFrame. Raises @@ -661,7 +668,7 @@ def any(self, *, skip_nulls: bool = True) -> Self: """ ... - def all(self, *, skip_nulls: bool = True) -> Self: + def all(self, *, skip_nulls: bool | Scalar = True) -> Self: """Reduction returns a 1-row DataFrame. Raises @@ -671,7 +678,7 @@ def all(self, *, skip_nulls: bool = True) -> Self: """ ... - def any_rowwise(self, *, skip_nulls: bool = True) -> Column: + def any_rowwise(self, *, skip_nulls: bool | Scalar = True) -> Column: """Reduction returns a Column. Differs from ``DataFrame.any`` and that the reduction happens @@ -684,7 +691,7 @@ def any_rowwise(self, *, skip_nulls: bool = True) -> Column: """ ... - def all_rowwise(self, *, skip_nulls: bool = True) -> Column: + def all_rowwise(self, *, skip_nulls: bool | Scalar = True) -> Column: """Reduction returns a Column. Differs from ``DataFrame.all`` and that the reduction happens @@ -697,31 +704,36 @@ def all_rowwise(self, *, skip_nulls: bool = True) -> Column: """ ... - def min(self, *, skip_nulls: bool = True) -> Self: + def min(self, *, skip_nulls: bool | Scalar = True) -> Self: """Reduction returns a 1-row DataFrame.""" ... 
- def max(self, *, skip_nulls: bool = True) -> Self: + def max(self, *, skip_nulls: bool | Scalar = True) -> Self: """Reduction returns a 1-row DataFrame.""" ... - def sum(self, *, skip_nulls: bool = True) -> Self: + def sum(self, *, skip_nulls: bool | Scalar = True) -> Self: """Reduction returns a 1-row DataFrame.""" ... - def prod(self, *, skip_nulls: bool = True) -> Self: + def prod(self, *, skip_nulls: bool | Scalar = True) -> Self: """Reduction returns a 1-row DataFrame.""" ... - def median(self, *, skip_nulls: bool = True) -> Self: + def median(self, *, skip_nulls: bool | Scalar = True) -> Self: """Reduction returns a 1-row DataFrame.""" ... - def mean(self, *, skip_nulls: bool = True) -> Self: + def mean(self, *, skip_nulls: bool | Scalar = True) -> Self: """Reduction returns a 1-row DataFrame.""" ... - def std(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Self: + def std( + self, + *, + correction: float | Scalar = 1, + skip_nulls: bool | Scalar = True, + ) -> Self: """Reduction returns a 1-row DataFrame. Parameters @@ -735,7 +747,12 @@ def std(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Self: """ ... - def var(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Self: + def var( + self, + *, + correction: float | Scalar = 1, + skip_nulls: bool | Scalar = True, + ) -> Self: """Reduction returns a 1-row DataFrame. Parameters @@ -787,7 +804,7 @@ def is_nan(self) -> Self: """ ... - def unique_indices(self, *keys: str, skip_nulls: bool = True) -> Column: + def unique_indices(self, *keys: str, skip_nulls: bool | Scalar = True) -> Column: """Return indices corresponding to unique values across selected columns. Parameters @@ -813,7 +830,7 @@ def unique_indices(self, *keys: str, skip_nulls: bool = True) -> Column: """ ... - def fill_nan(self, value: float | NullType, /) -> Self: + def fill_nan(self, value: float | NullType | Scalar, /) -> Self: """Fill ``nan`` values with the given fill value. 
The fill operation will apply to all columns with a floating-point @@ -831,7 +848,7 @@ def fill_nan(self, value: float | NullType, /) -> Self: def fill_null( self, - value: Scalar, + value: AnyScalar, /, *, column_names: list[str] | None = None, diff --git a/spec/API_specification/dataframe_api/groupby_object.py b/spec/API_specification/dataframe_api/groupby_object.py index adecb8aa..8b26eff3 100644 --- a/spec/API_specification/dataframe_api/groupby_object.py +++ b/spec/API_specification/dataframe_api/groupby_object.py @@ -4,6 +4,7 @@ if TYPE_CHECKING: from .dataframe_object import DataFrame + from .typing import Scalar __all__ = [ @@ -22,34 +23,44 @@ class GroupBy(Protocol): """ - def any(self, *, skip_nulls: bool = True) -> DataFrame: + def any(self, *, skip_nulls: bool | Scalar = True) -> DataFrame: ... - def all(self, *, skip_nulls: bool = True) -> DataFrame: + def all(self, *, skip_nulls: bool | Scalar = True) -> DataFrame: ... - def min(self, *, skip_nulls: bool = True) -> DataFrame: + def min(self, *, skip_nulls: bool | Scalar = True) -> DataFrame: ... - def max(self, *, skip_nulls: bool = True) -> DataFrame: + def max(self, *, skip_nulls: bool | Scalar = True) -> DataFrame: ... - def sum(self, *, skip_nulls: bool = True) -> DataFrame: + def sum(self, *, skip_nulls: bool | Scalar = True) -> DataFrame: ... - def prod(self, *, skip_nulls: bool = True) -> DataFrame: + def prod(self, *, skip_nulls: bool | Scalar = True) -> DataFrame: ... - def median(self, *, skip_nulls: bool = True) -> DataFrame: + def median(self, *, skip_nulls: bool | Scalar = True) -> DataFrame: ... - def mean(self, *, skip_nulls: bool = True) -> DataFrame: + def mean(self, *, skip_nulls: bool | Scalar = True) -> DataFrame: ... - def std(self, *, correction: int | float = 1, skip_nulls: bool = True) -> DataFrame: + def std( + self, + *, + correction: float | Scalar = 1, + skip_nulls: bool | Scalar = True, + ) -> DataFrame: ... 
- def var(self, *, correction: int | float = 1, skip_nulls: bool = True) -> DataFrame: + def var( + self, + *, + correction: float | Scalar = 1, + skip_nulls: bool | Scalar = True, + ) -> DataFrame: ... def size(self) -> DataFrame: @@ -74,7 +85,7 @@ def aggregate(self, *aggregation: Aggregation) -> DataFrame: class Aggregation(Protocol): - def rename(self, name: str) -> Aggregation: + def rename(self, name: str | Scalar) -> Aggregation: """Assign given name to output of aggregation. If not called, the column's name will be used as the output name. @@ -82,35 +93,35 @@ def rename(self, name: str) -> Aggregation: ... @classmethod - def any(cls, column: str, *, skip_nulls: bool = True) -> Aggregation: + def any(cls, column: str, *, skip_nulls: bool | Scalar = True) -> Aggregation: ... @classmethod - def all(cls, column: str, *, skip_nulls: bool = True) -> Aggregation: + def all(cls, column: str, *, skip_nulls: bool | Scalar = True) -> Aggregation: ... @classmethod - def min(cls, column: str, *, skip_nulls: bool = True) -> Aggregation: + def min(cls, column: str, *, skip_nulls: bool | Scalar = True) -> Aggregation: ... @classmethod - def max(cls, column: str, *, skip_nulls: bool = True) -> Aggregation: + def max(cls, column: str, *, skip_nulls: bool | Scalar = True) -> Aggregation: ... @classmethod - def sum(cls, column: str, *, skip_nulls: bool = True) -> Aggregation: + def sum(cls, column: str, *, skip_nulls: bool | Scalar = True) -> Aggregation: ... @classmethod - def prod(cls, column: str, *, skip_nulls: bool = True) -> Aggregation: + def prod(cls, column: str, *, skip_nulls: bool | Scalar = True) -> Aggregation: ... @classmethod - def median(cls, column: str, *, skip_nulls: bool = True) -> Aggregation: + def median(cls, column: str, *, skip_nulls: bool | Scalar = True) -> Aggregation: ... @classmethod - def mean(cls, column: str, *, skip_nulls: bool = True) -> Aggregation: + def mean(cls, column: str, *, skip_nulls: bool | Scalar = True) -> Aggregation: ... 
@classmethod @@ -118,8 +129,8 @@ def std( cls, column: str, *, - correction: int | float = 1, - skip_nulls: bool = True, + correction: float | Scalar = 1, + skip_nulls: bool | Scalar = True, ) -> Aggregation: ... @@ -128,8 +139,8 @@ def var( cls, column: str, *, - correction: int | float = 1, - skip_nulls: bool = True, + correction: float | Scalar = 1, + skip_nulls: bool | Scalar = True, ) -> Aggregation: ... diff --git a/spec/API_specification/dataframe_api/scalar_object.py b/spec/API_specification/dataframe_api/scalar_object.py new file mode 100644 index 00000000..078cf2a4 --- /dev/null +++ b/spec/API_specification/dataframe_api/scalar_object.py @@ -0,0 +1,120 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Protocol + +if TYPE_CHECKING: + from typing_extensions import Self + + from dataframe_api.typing import AnyScalar, DType + +__all__ = ["Scalar"] + + +class Scalar(Protocol): + """Scalar object. + + Not meant to be instantiated directly, but rather created via + :meth:`Column.get_value` or one of the column reductions such + as :meth:`Column.sum`. + + Note that, just like how a :class:`Column` can hold null values, + a `Scalar` can also be backed by a null value. Given that `Scalar`s + aren't instantiated directly, but rather derived from existing + `Column`s, `Scalar.dtype` is determined by the parent `Column`. + For example, if `column` is a `Column` of dtype `Int64`, then + `column.get_value(0)` will return a `Scalar` of dtype `Int64` + (even if it is backed by a null value). + """ + + def __lt__(self, other: AnyScalar) -> Scalar: + ... + + def __le__(self, other: AnyScalar) -> Scalar: + ... + + def __eq__(self, other: AnyScalar) -> Scalar: # type: ignore[override] + ... + + def __ne__(self, other: AnyScalar) -> Scalar: # type: ignore[override] + ... + + def __gt__(self, other: AnyScalar) -> Scalar: + ... + + def __ge__(self, other: AnyScalar) -> Scalar: + ... + + def __add__(self, other: AnyScalar) -> Scalar: + ... 
+ + def __radd__(self, other: AnyScalar) -> Scalar: + ... + + def __sub__(self, other: AnyScalar) -> Scalar: + ... + + def __rsub__(self, other: AnyScalar) -> Scalar: + ... + + def __mul__(self, other: AnyScalar) -> Scalar: + ... + + def __rmul__(self, other: AnyScalar) -> Scalar: + ... + + def __mod__(self, other: AnyScalar) -> Scalar: + ... + + # Signatures of "__rmod__" of "Scalar" and "__mod__" of "str | int | float | Scalar" + # are unsafely overlapping + def __rmod__(self, other: AnyScalar) -> Scalar: # type: ignore[misc] + ... + + def __pow__(self, other: AnyScalar) -> Scalar: + ... + + def __rpow__(self, other: AnyScalar) -> Scalar: + ... + + def __floordiv__(self, other: AnyScalar) -> Scalar: + ... + + def __rfloordiv__(self, other: AnyScalar) -> Scalar: + ... + + def __truediv__(self, other: AnyScalar) -> Scalar: + ... + + def __rtruediv__(self, other: AnyScalar) -> Scalar: + ... + + def __neg__(self) -> Scalar: + ... + + def __abs__(self) -> Scalar: + ... + + def __bool__(self) -> bool: + """Note that this returns a Python scalar. + + Depending on the implementation, this may raise or trigger computation. + """ + ... + + @property + def dtype(self) -> DType: + """Return data type of scalar.""" + ... + + def persist(self) -> Self: + """Hint that computation prior to this point should not be repeated. + + This is intended as a hint, rather than as a directive. Implementations + which do not separate lazy vs eager execution may ignore this method and + treat it as a no-op. + + .. note:: + This may trigger computation and so should be used with care. + See the `execution_model` page for more details. + """ + ... 
diff --git a/spec/API_specification/dataframe_api/typing.py b/spec/API_specification/dataframe_api/typing.py index e344e111..574c9480 100644 --- a/spec/API_specification/dataframe_api/typing.py +++ b/spec/API_specification/dataframe_api/typing.py @@ -14,13 +14,12 @@ from dataframe_api.groupby_object import Aggregation as AggregationT from dataframe_api.groupby_object import GroupBy +from .scalar_object import Scalar + if TYPE_CHECKING: from collections.abc import Sequence -# Type alias: Mypy needs Any, but for readability we need to make clear this -# is a Python scalar (i.e., an instance of `bool`, `int`, `float`, `str`, etc.) -Scalar = Any # null is a special object which represents a missing value. # It is not valid as a type. @@ -131,7 +130,6 @@ def date(self, year: int, month: int, day: int) -> Scalar: ... -NullType = Namespace.NullType DType = Union[ Namespace.Bool, Namespace.Float64, @@ -169,15 +167,20 @@ def __column_consortium_standard__( ... +PythonScalar = Union[str, int, float, bool] +AnyScalar = Union[PythonScalar, Scalar] +NullType = Namespace.NullType + + __all__ = [ "Column", "DataFrame", "DType", "GroupBy", "Namespace", - "NullType", + "AnyScalar", "Scalar", + "NullType", "SupportsColumnAPI", "SupportsDataFrameAPI", - "Scalar", ] diff --git a/spec/API_specification/examples/05_scalars_example.py b/spec/API_specification/examples/05_scalars_example.py new file mode 100644 index 00000000..acf001a8 --- /dev/null +++ b/spec/API_specification/examples/05_scalars_example.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from dataframe_api.column_object import Column + from dataframe_api.dataframe_object import DataFrame + from dataframe_api.scalar_object import Scalar + from dataframe_api.typing import SupportsDataFrameAPI + + +def main(df_raw: SupportsDataFrameAPI) -> SupportsDataFrameAPI: + df = df_raw.__dataframe_consortium_standard__(api_version="2023-11.beta") + + # 
`DataFrame.fill_null` accepts `AnyScalar` objects. + # This means we can fill nulls using a Standard Scalar object... + df = df.fill_null(df.col("a").mean()) + + # ... but also Python scalars: + df = df.fill_null(3) + df = df.fill_null("3") + + # Scalars can be used in arithmetic expressions with other scalars, columns, + # or DataFrames + value: Scalar = df.col("a").mean() + col: Column = df.col("a") + _res1: Column = value - col + _res2: Scalar = value - 3 + _res3: Scalar = 3 - value + _res4: Column = df.col("a") - 3 + _res5: DataFrame = df - value + _res6: DataFrame = value - df + + return df.dataframe diff --git a/spec/API_specification/pyproject.toml b/spec/API_specification/pyproject.toml index 8b88d455..31c5f2ad 100644 --- a/spec/API_specification/pyproject.toml +++ b/spec/API_specification/pyproject.toml @@ -47,5 +47,4 @@ ignore = [ "N999", # invalid-module-name "PD901", # pandas-df-variable-name "PLR0913", # too-many-arguments - "PYI041", # redundant-numeric-union ] diff --git a/spec/conf.py b/spec/conf.py index cc6e3270..a89cfee5 100644 --- a/spec/conf.py +++ b/spec/conf.py @@ -82,10 +82,11 @@ ('py:class', 'enum.Enum'), ('py:class', 'ellipsis'), ('py:class', 'Scalar'), + ('py:class', 'AnyScalar'), + ('py:class', 'NullType'), ('py:class', 'Bool'), ('py:class', 'optional'), ('py:class', 'Aggregation'), - ('py:class', 'NullType'), ('py:class', 'Namespace'), ('py:class', 'SupportsDataFrameAPI'), ('py:class', 'Self'), diff --git a/spec/design_topics/python_builtin_types.md b/spec/design_topics/python_builtin_types.md index c85812eb..ee0c3541 100644 --- a/spec/design_topics/python_builtin_types.md +++ b/spec/design_topics/python_builtin_types.md @@ -6,9 +6,9 @@ it is also potentially problematic when trying to write performant dataframe library code or supporting devices other than CPU. This standard specifies the use of Python types in quite a few places, and uses -them as type annotations. 
As a concrete example, consider the `mean` method and -the `float` it is documented to return, in combination with the `__gt__` method -(i.e., the `>` operator) on the dataframe: +them as type annotations. As a concrete example, consider the `mean` method, +the `bool | Scalar` argument it takes, and the `Scalar` it is documented to return, +in combination with the `__gt__` method (i.e., the `>` operator) on the dataframe: ```python class DataFrame: @@ -18,24 +18,51 @@ class DataFrame: ... class Column: - def mean(self, skip_nulls: bool = True) -> float | NullType: + def mean(self, skip_nulls: bool | Scalar = True) -> Scalar: ... -larger = df2 > df1.col('foo').mean() +larger = df2 > df1.col('foo').mean(skip_nulls=True) ``` -For a GPU dataframe library, it is desirable for all data to reside on the GPU, -and not incur a performance penalty from synchronizing instances of Python -builtin types to CPU. In the above example, the `.mean()` call returns a -`float`. It is likely beneficial though to implement this as a library-specific -scalar object which duck types with `float`. This means that it should (a) have -the same semantics as a builtin `float` when used within a library, and (b) -support usage as a `float` outside of the library (i.e., implement -`__float__`). Duck typing is usually not perfect, for example `isinstance` -usage on the float-like duck type will behave differently. Such explicit "type -of object" checks don't have to be supported. - -The following design rule applies everywhere builtin Python types are used -within this API standard: _where a Python builtin type is specified, an -implementation may always replace it by an equivalent library-specific type -that duck types with the Python builtin type._ +Let's go through these arguments: +- `skip_nulls: bool | Scalar`. 
This means we can either pass a Python `bool`, or + a `Scalar` object backed by a boolean; +- the return value of `.mean()` is a `Scalar` +- the argument `other` of `__gt__` is typed as `AnyScalar`, meaning that we can + compare a `DataFrame` with a Python scalar (e.g. `df > 3`) or with a `Scalar` + (e.g. `df > df.col('a').mean()`) +- the return value of `__gt__` is a `DataFrame` + +Returning values as `Scalar` allows scalars to reside on different devices (e.g. GPU), +or to stay lazy (if a library allows it). + +## Example + +For example, if a library implements `FancyFloat` and `FancyBool` scalars, +then the following should all be supported: +```python +df: DataFrame +column_1: Column = df.col('a') +column_2: Column = df.col('b') + +scalar: FancyFloat = column_1.std() +result_1: Column = column_2 - column_1.std() +result_2: FancyBool = column_2.std() > column_1.std() +``` + +Note that the scalars above are library-specific ones - they may be used to keep +data on GPU, or to keep data lazy. + +The following, however, may raise, depending on the +implementation: +```python +df: DataFrame +column = df.col('a') + +if column.std() > 0: # this line may raise! + print('std is positive') +``` +This is because `if column.std() > 0` will call `(column.std() > 0).__bool__()`, +which is required by Python to produce a Python scalar. +Therefore, a purely lazy dataframe library may choose to raise here, whereas +one which allows for eager execution may return a Python bool.