Rework groupby and resample core modules (pandas-dev#848)

* Rework groupby and resample core modules
* Runtime Series is not generic
* Remove default values and deprecated
* Use np.integer in tests
* Add a comment for Incomplete
* Remove private objects
* Remove deprecated Resampler.fillna
* Remove private constructors
* Remove more private constructors
* Temporarily type labelsize as int
  Needs fix everywhere and the upstream docs should be updated
* Tighten rolling and expanding method type
* Remove step from groupby rolling
* Fix resample rule
* Fix groupby fillna
* Add missing test for linked issue
* Remove pandas.core.apply as it is not used in public code
* Address CR
* Address remaining CR
* Revert pyproject change
* Temporarily pin pyright to unblock CI
* New deprecations
* Windows tests
* Apply suggestions from code review
skatsuta · Feb 6, 2024 · e35c3ca · e35c3ca
1 parent 56eafc1
commit e35c3ca
Show file tree

Hide file tree

Showing 28 changed files with 2,325 additions and 851 deletions.
diff --git a/pandas-stubs/_libs/properties.pyi b/pandas-stubs/_libs/properties.pyi
@@ -5,7 +5,10 @@ class CachedProperty:
     def __get__(self, obj, typ): ...
     def __set__(self, obj, value) -> None: ...
 
-cache_readonly: CachedProperty = ...
+# note: this is a lie to make type checkers happy (they special
+# case property). cache_readonly uses attribute names similar to
+# property (fget) but it does not provide fset and fdel.
+cache_readonly = property
 
 class AxisProperty:
     def __init__(self, axis: int = ..., doc: str = ...) -> None: ...

diff --git a/pandas-stubs/_typing.pyi b/pandas-stubs/_typing.pyi
@@ -48,6 +48,11 @@ from pandas.core.dtypes.dtypes import (
 
 from pandas.io.formats.format import EngFormatter
 
+# `Incomplete` is equivalent to `Any`. Use it to annotate symbols that you don't
+# know the type of yet and that should be changed in the future. Use `Any` only
+# where it is the only acceptable type.
+Incomplete: TypeAlias = Any
+
 ArrayLike: TypeAlias = ExtensionArray | np.ndarray
 AnyArrayLike: TypeAlias = Index | Series | np.ndarray
 PythonScalar: TypeAlias = str | bool | complex
@@ -80,6 +85,10 @@ class FulldatetimeDict(YearMonthDayDict, total=False):
     us: DatetimeDictArg
     ns: DatetimeDictArg
 
+CorrelationMethod: TypeAlias = (
+    Literal["pearson", "kendall", "spearman"]
+    | Callable[[np.ndarray, np.ndarray], float]
+)
 # dtypes
 NpDtype: TypeAlias = str | np.dtype[np.generic] | type[str | complex | bool | object]
 Dtype: TypeAlias = ExtensionDtype | NpDtype
@@ -444,6 +453,7 @@ class SequenceNotStr(Protocol[_T_co]):
 IndexLabel: TypeAlias = Hashable | Sequence[Hashable]
 Label: TypeAlias = Hashable | None
 Level: TypeAlias = Hashable | int
+Shape: TypeAlias = tuple[int, ...]
 Suffixes: TypeAlias = tuple[str | None, str | None]
 Ordered: TypeAlias = bool | None
 JSONSerializable: TypeAlias = PythonScalar | list | dict
@@ -469,8 +479,11 @@ AggFuncTypeSeriesToFrame: TypeAlias = list[AggFuncTypeBase] | AggFuncTypeDictSer
 AggFuncTypeFrame: TypeAlias = (
     AggFuncTypeBase | list[AggFuncTypeBase] | AggFuncTypeDictFrame
 )
+AggFuncTypeDict: TypeAlias = AggFuncTypeDictSeries | AggFuncTypeDictFrame
+AggFuncType: TypeAlias = AggFuncTypeBase | list[AggFuncTypeBase] | AggFuncTypeDict
 
 num: TypeAlias = complex
+AxisInt: TypeAlias = int
 AxisIndex: TypeAlias = Literal["index", 0]
 AxisColumn: TypeAlias = Literal["columns", 1]
 Axis: TypeAlias = AxisIndex | AxisColumn
@@ -563,6 +576,9 @@ IndexT = TypeVar("IndexT", bound=Index)
 IntervalT = TypeVar("IntervalT", bound=Interval)
 IntervalClosedType: TypeAlias = Literal["left", "right", "both", "neither"]
 
+ScalarIndexer: TypeAlias = int | np.integer
+SequenceIndexer: TypeAlias = slice | list[int] | np.ndarray
+PositionalIndexer: TypeAlias = ScalarIndexer | SequenceIndexer
 TakeIndexer: TypeAlias = Sequence[int] | Sequence[np.integer] | npt.NDArray[np.integer]
 
 IgnoreRaiseCoerce: TypeAlias = Literal["ignore", "raise", "coerce"]
@@ -758,5 +774,9 @@ RandomState: TypeAlias = (
     | np.random.BitGenerator
     | np.random.RandomState
 )
+Frequency: TypeAlias = str | BaseOffset
+TimeGrouperOrigin: TypeAlias = (
+    Timestamp | Literal["epoch", "start", "start_day", "end", "end_day"]
+)
 
 __all__ = ["npt", "type_t"]
diff --git a/pandas-stubs/core/base.pyi b/pandas-stubs/core/base.pyi
@@ -1,7 +1,12 @@
-from collections.abc import Iterator
+from collections.abc import (
+    Hashable,
+    Iterator,
+)
 from typing import (
+    Any,
     Generic,
     Literal,
+    final,
 )
 
 import numpy as np
@@ -19,13 +24,19 @@ from pandas._typing import (
     Scalar,
     npt,
 )
+from pandas.util._decorators import cache_readonly
 
 class NoNewAttributesMixin:
-    def __setattr__(self, key, value) -> None: ...
+    def __setattr__(self, key: str, value: Any) -> None: ...
 
 class SelectionMixin(Generic[NDFrameT]):
+    obj: NDFrameT
+    exclusions: frozenset[Hashable]
+    @final
+    @cache_readonly
     def ndim(self) -> int: ...
     def __getitem__(self, key): ...
+    def aggregate(self, func, *args, **kwargs): ...
 
 class IndexOpsMixin(OpsMixin, Generic[S1]):
     __array_priority__: int = ...

diff --git a/pandas-stubs/core/frame.pyi b/pandas-stubs/core/frame.pyi
@@ -41,7 +41,6 @@ from pandas.core.indexing import (
     _LocIndexer,
 )
 from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg
-from pandas.core.resample import Resampler
 from pandas.core.series import Series
 from pandas.core.window import (
     Expanding,
@@ -54,6 +53,7 @@ from pandas.core.window.rolling import (
 from typing_extensions import Self
 import xarray as xr
 
+from pandas._libs.lib import NoDefault
 from pandas._libs.missing import NAType
 from pandas._libs.tslibs import BaseOffset
 from pandas._libs.tslibs.nattype import NaTType
@@ -1006,104 +1006,96 @@ class DataFrame(NDFrame, OpsMixin):
     def groupby(
         self,
         by: Scalar,
-        axis: AxisIndex = ...,
+        axis: AxisIndex | NoDefault = ...,
         level: IndexLabel | None = ...,
         as_index: _bool = ...,
         sort: _bool = ...,
         group_keys: _bool = ...,
-        squeeze: _bool = ...,
-        observed: _bool = ...,
+        observed: _bool | NoDefault = ...,
         dropna: _bool = ...,
     ) -> DataFrameGroupBy[Scalar]: ...
     @overload
     def groupby(
         self,
         by: DatetimeIndex,
-        axis: AxisIndex = ...,
+        axis: AxisIndex | NoDefault = ...,
         level: IndexLabel | None = ...,
         as_index: _bool = ...,
         sort: _bool = ...,
         group_keys: _bool = ...,
-        squeeze: _bool = ...,
-        observed: _bool = ...,
+        observed: _bool | NoDefault = ...,
         dropna: _bool = ...,
     ) -> DataFrameGroupBy[Timestamp]: ...
     @overload
     def groupby(
         self,
         by: TimedeltaIndex,
-        axis: AxisIndex = ...,
+        axis: AxisIndex | NoDefault = ...,
         level: IndexLabel | None = ...,
         as_index: _bool = ...,
         sort: _bool = ...,
         group_keys: _bool = ...,
-        squeeze: _bool = ...,
-        observed: _bool = ...,
+        observed: _bool | NoDefault = ...,
         dropna: _bool = ...,
     ) -> DataFrameGroupBy[Timedelta]: ...
     @overload
     def groupby(
         self,
         by: PeriodIndex,
-        axis: AxisIndex = ...,
+        axis: AxisIndex | NoDefault = ...,
         level: IndexLabel | None = ...,
         as_index: _bool = ...,
         sort: _bool = ...,
         group_keys: _bool = ...,
-        squeeze: _bool = ...,
-        observed: _bool = ...,
+        observed: _bool | NoDefault = ...,
         dropna: _bool = ...,
     ) -> DataFrameGroupBy[Period]: ...
     @overload
     def groupby(
         self,
         by: IntervalIndex[IntervalT],
-        axis: AxisIndex = ...,
+        axis: AxisIndex | NoDefault = ...,
         level: IndexLabel | None = ...,
         as_index: _bool = ...,
         sort: _bool = ...,
         group_keys: _bool = ...,
-        squeeze: _bool = ...,
-        observed: _bool = ...,
+        observed: _bool | NoDefault = ...,
         dropna: _bool = ...,
     ) -> DataFrameGroupBy[IntervalT]: ...
     @overload
     def groupby(
         self,
         by: MultiIndex | GroupByObjectNonScalar | None = ...,
-        axis: AxisIndex = ...,
+        axis: AxisIndex | NoDefault = ...,
         level: IndexLabel | None = ...,
         as_index: _bool = ...,
         sort: _bool = ...,
         group_keys: _bool = ...,
-        squeeze: _bool = ...,
-        observed: _bool = ...,
+        observed: _bool | NoDefault = ...,
         dropna: _bool = ...,
     ) -> DataFrameGroupBy[tuple]: ...
     @overload
     def groupby(
         self,
         by: Series[SeriesByT],
-        axis: AxisIndex = ...,
+        axis: AxisIndex | NoDefault = ...,
         level: IndexLabel | None = ...,
         as_index: _bool = ...,
         sort: _bool = ...,
         group_keys: _bool = ...,
-        squeeze: _bool = ...,
-        observed: _bool = ...,
+        observed: _bool | NoDefault = ...,
         dropna: _bool = ...,
     ) -> DataFrameGroupBy[SeriesByT]: ...
     @overload
     def groupby(
         self,
         by: CategoricalIndex | Index | Series,
-        axis: AxisIndex = ...,
+        axis: AxisIndex | NoDefault = ...,
         level: IndexLabel | None = ...,
         as_index: _bool = ...,
         sort: _bool = ...,
         group_keys: _bool = ...,
-        squeeze: _bool = ...,
-        observed: _bool = ...,
+        observed: _bool | NoDefault = ...,
         dropna: _bool = ...,
     ) -> DataFrameGroupBy[Any]: ...
     def pivot(
@@ -1921,21 +1913,6 @@ class DataFrame(NDFrame, OpsMixin):
         *,
         inplace: Literal[False] = ...,
     ) -> DataFrame: ...
-    def resample(
-        self,
-        rule,
-        axis: Axis = ...,
-        closed: _str | None = ...,
-        label: _str | None = ...,
-        convention: TimestampConvention = ...,
-        kind: Literal["timestamp", "period"] | None = ...,
-        on: _str | None = ...,
-        level: Level | None = ...,
-        origin: Timestamp
-        | Literal["epoch", "start", "start_day", "end", "end_day"] = ...,
-        offset: dt.timedelta | Timedelta | _str | None = ...,
-        group_keys: _bool = ...,
-    ) -> Resampler[DataFrame]: ...
     def rfloordiv(
         self,
         other,

diff --git a/pandas-stubs/core/generic.pyi b/pandas-stubs/core/generic.pyi
@@ -5,6 +5,7 @@ from collections.abc import (
     Mapping,
     Sequence,
 )
+import datetime as dt
 import sqlite3
 from typing import (
     Any,
@@ -17,13 +18,15 @@ from typing import (
 import numpy as np
 from pandas import Index
 import pandas.core.indexing as indexing
+from pandas.core.resample import DatetimeIndexResampler
 from pandas.core.series import Series
 import sqlalchemy.engine
 from typing_extensions import (
     Concatenate,
     Self,
 )
 
+from pandas._libs.lib import NoDefault
 from pandas._typing import (
     S1,
     ArrayLike,
@@ -37,6 +40,7 @@ from pandas._typing import (
     FilePath,
     FileWriteMode,
     FillnaOptions,
+    Frequency,
     HashableT1,
     HashableT2,
     HDFCompLib,
@@ -48,6 +52,10 @@ from pandas._typing import (
     SortKind,
     StorageOptions,
     T,
+    TimedeltaConvertibleTypes,
+    TimeGrouperOrigin,
+    TimestampConvention,
+    TimestampConvertibleTypes,
     WriteBuffer,
 )
 
@@ -432,6 +440,21 @@ class NDFrame(indexing.IndexingMixin):
         end_time,
         axis=...,
     ) -> Self: ...
+    @final
+    def resample(
+        self,
+        rule: Frequency | dt.timedelta,
+        axis: Axis | NoDefault = ...,
+        closed: Literal["right", "left"] | None = ...,
+        label: Literal["right", "left"] | None = ...,
+        convention: TimestampConvention = ...,
+        kind: Literal["period", "timestamp"] | None = ...,
+        on: Level | None = ...,
+        level: Level | None = ...,
+        origin: TimeGrouperOrigin | TimestampConvertibleTypes = ...,
+        offset: TimedeltaConvertibleTypes | None = ...,
+        group_keys: _bool = ...,
+    ) -> DatetimeIndexResampler[Self]: ...
     def first(self, offset) -> Self: ...
     def last(self, offset) -> Self: ...
     def rank(

diff --git a/pandas-stubs/core/groupby/__init__.pyi b/pandas-stubs/core/groupby/__init__.pyi
@@ -1,2 +1,15 @@
-from pandas.core.groupby.generic import NamedAgg as NamedAgg
+from pandas.core.groupby.generic import (
+    DataFrameGroupBy as DataFrameGroupBy,
+    NamedAgg as NamedAgg,
+    SeriesGroupBy as SeriesGroupBy,
+)
+from pandas.core.groupby.groupby import GroupBy as GroupBy
 from pandas.core.groupby.grouper import Grouper as Grouper
+
+__all__ = [
+    "DataFrameGroupBy",
+    "NamedAgg",
+    "SeriesGroupBy",
+    "GroupBy",
+    "Grouper",
+]
diff --git a/pandas-stubs/core/groupby/base.pyi b/pandas-stubs/core/groupby/base.pyi
@@ -1,4 +1,3 @@
-# from pandas.core.dtypes.common import is_list_like as is_list_like, is_scalar as is_scalar
 from collections.abc import Hashable
 import dataclasses
 

diff --git a/pandas-stubs/core/groupby/categorical.pyi b/pandas-stubs/core/groupby/categorical.pyi
@@ -1,6 +0,0 @@
-from pandas.core.arrays.categorical import (  # , CategoricalDtype as CategoricalDtype
-    Categorical,
-)
-
-def recode_for_groupby(c: Categorical, sort: bool, observed: bool): ...
-def recode_from_groupby(c: Categorical, sort: bool, ci): ...