From 3505c7a4227fbff1fa7e807e33845f6d8db58e04 Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 28 Feb 2024 22:32:42 -0500 Subject: [PATCH 1/8] BUG: groupby.agg should always agg --- doc/source/development/maintaining.rst | 28 ---- doc/source/whatsnew/v3.0.0.rst | 7 +- pandas/core/groupby/generic.py | 158 ++++++++++++------ pandas/core/groupby/ops.py | 13 +- pandas/core/indexes/range.py | 14 ++ pandas/core/resample.py | 65 ++++++- pandas/io/excel/_calamine.py | 2 +- .../tests/groupby/aggregate/test_aggregate.py | 59 ++++--- pandas/tests/groupby/aggregate/test_other.py | 2 +- pandas/tests/groupby/test_categorical.py | 4 +- pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/groupby/test_reductions.py | 24 +-- pandas/tests/indexes/ranges/test_range.py | 38 +++++ pandas/tests/io/pytables/test_append.py | 2 + pandas/tests/resample/test_resample_api.py | 4 +- web/pandas/community/benchmarks.md | 79 +++++++++ web/pandas/config.yml | 2 + 17 files changed, 371 insertions(+), 132 deletions(-) create mode 100644 web/pandas/community/benchmarks.md diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index f177684b8c98f..5d833dca50732 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -326,34 +326,6 @@ a milestone before tagging, you can request the bot to backport it with: @Meeseeksdev backport -.. _maintaining.asv-machine: - -Benchmark machine ------------------ - -The team currently owns dedicated hardware for hosting a website for pandas' ASV performance benchmark. The results -are published to https://asv-runner.github.io/asv-collection/pandas/ - -Configuration -````````````` - -The machine can be configured with the `Ansible `_ playbook in https://github.com/tomaugspurger/asv-runner. - -Publishing -`````````` - -The results are published to another GitHub repository, https://github.com/tomaugspurger/asv-collection. -Finally, we have a cron job on our docs server to pull from https://github.com/tomaugspurger/asv-collection, to serve them from ``/speed``. -Ask Tom or Joris for access to the webserver. - -Debugging -````````` - -The benchmarks are scheduled by Airflow. It has a dashboard for viewing and debugging the results. You'll need to setup an SSH tunnel to view them - - ssh -L 8080:localhost:8080 pandas@panda.likescandy.com - - .. _maintaining.release: Release process diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a95f0485abd5f..ceccec47a4269 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -191,6 +191,7 @@ Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. 
  That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`)
 - :meth:`Series.dt.to_pydatetime` now returns a :class:`Series` of :py:class:`datetime.datetime` objects (:issue:`52459`)
+- :meth:`SeriesGroupBy.agg` no longer pins the name of the group to the input passed to the provided ``func`` (:issue:`51703`)
 - All arguments except ``name`` in :meth:`Index.rename` are now keyword only (:issue:`56493`)
 - All arguments except the first ``path``-like argument in IO writers are now keyword only (:issue:`54229`)
 - All arguments in :meth:`Index.sort_values` are now keyword only (:issue:`56493`)
@@ -238,12 +239,12 @@ Removal of prior version deprecations/changes
 - Removed unused arguments ``*args`` and ``**kwargs`` in :class:`Resampler` methods (:issue:`50977`)
 - Unrecognized timezones when parsing strings to datetimes now raises a ``ValueError`` (:issue:`51477`)
-
 .. ---------------------------------------------------------------------------
 .. _whatsnew_300.performance:

 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
+- :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`)
 - Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`)
 - Performance improvement in :meth:`DataFrame.join` for sorted but non-unique indexes (:issue:`56941`)
 - Performance improvement in :meth:`DataFrame.join` when left and/or right are non-unique and ``how`` is ``"left"``, ``"right"``, or ``"inner"`` (:issue:`56817`)
@@ -252,11 +253,11 @@ Performance improvements
 - Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`)
 - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
 - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`)
+- Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`)
 - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`)
 - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`)
-- Performance improvement in indexing operations for string dtypes (:issue:`56997`)
-- :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`?``)
 - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`)
+- Performance improvement in indexing operations for string dtypes (:issue:`56997`)

 .. ---------------------------------------------------------------------------
 ..
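For the :meth:`SeriesGroupBy.agg` change above, code that relied on the group key being pinned to each group's ``name`` attribute should iterate over the groupby object instead, as the removed deprecation warning advised. A minimal migration sketch (the ``group_constants`` mapping is an illustrative stand-in for whatever per-group lookup the old ``func`` performed):

```python
import pandas as pd

ser = pd.Series([1.0, 2.0, 3.0, 4.0], index=[0, 0, 1, 1])
group_constants = {0: 10, 1: 20}  # hypothetical per-group lookup table

# Before this change, func could rely on the group key being pinned:
#   ser.groupby(level=0).agg(lambda x: group_constants[x.name] + x.mean())

# After, iterate over the groupby object when the key itself is needed:
result = pd.Series(
    {name: group_constants[name] + group.mean()
     for name, group in ser.groupby(level=0)}
)
```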
_whatsnew_300.bug_fixes: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ab5e8bbd4528c..9d9d5a7e49ed1 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -62,10 +62,7 @@ ) import pandas.core.common as com from pandas.core.frame import DataFrame -from pandas.core.groupby import ( - base, - ops, -) +from pandas.core.groupby import base from pandas.core.groupby.groupby import ( GroupBy, GroupByPlot, @@ -373,34 +370,64 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) index=self._grouper.result_index, dtype=obj.dtype, ) + return self._python_agg_general(func, *args, **kwargs) - if self._grouper.nkeys > 1: - return self._python_agg_general(func, *args, **kwargs) + agg = aggregate - try: - return self._python_agg_general(func, *args, **kwargs) - except KeyError: - # KeyError raised in test_groupby.test_basic is bc the func does - # a dictionary lookup on group.name, but group name is not - # pinned in _python_agg_general, only in _aggregate_named - result = self._aggregate_named(func, *args, **kwargs) + def _agg_for_resample( + self, func=None, *args, engine=None, engine_kwargs=None, **kwargs + ): + relabeling = func is None + columns = None + if relabeling: + columns, func = validate_func_kwargs(kwargs) + kwargs = {} - warnings.warn( - "Pinning the groupby key to each group in " - f"{type(self).__name__}.agg is deprecated, and cases that " - "relied on it will raise in a future version. " - "If your operation requires utilizing the groupby keys, " - "iterate over the groupby object instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) + if isinstance(func, str): + if maybe_use_numba(engine) and engine is not None: + # Not all agg functions support numba, only propagate numba kwargs + # if user asks for numba, and engine is not None + # (if engine is None, the called function will handle the case where + # numba is requested via the global option) + kwargs["engine"] = engine + if engine_kwargs is not None: + kwargs["engine_kwargs"] = engine_kwargs + return getattr(self, func)(*args, **kwargs) - # result is a dict whose keys are the elements of result_index - result = Series(result, index=self._grouper.result_index) - result = self._wrap_aggregated_output(result) - return result + elif isinstance(func, abc.Iterable): + # Catch instances of lists / tuples + # but not the class list / tuple itself. + func = maybe_mangle_lambdas(func) + kwargs["engine"] = engine + kwargs["engine_kwargs"] = engine_kwargs + ret = self._aggregate_multiple_funcs(func, *args, **kwargs) + if relabeling: + # columns is not narrowed by mypy from relabeling flag + assert columns is not None # for mypy + ret.columns = columns + if not self.as_index: + ret = ret.reset_index() + return ret - agg = aggregate + else: + if maybe_use_numba(engine): + return self._aggregate_with_numba( + func, *args, engine_kwargs=engine_kwargs, **kwargs + ) + + if self.ngroups == 0: + # e.g. test_evaluate_with_empty_groups without any groups to + # iterate over, we have no output on which to do dtype + # inference. We default to using the existing dtype. 
+ # xref GH#51445 + obj = self._obj_with_exclusions + return self.obj._constructor( + [], + name=self.obj.name, + index=self._grouper.result_index, + dtype=obj.dtype, + ) + return self._python_agg_general(func, *args, **kwargs) def _python_agg_general(self, func, *args, **kwargs): f = lambda x: func(x, *args, **kwargs) @@ -527,26 +554,6 @@ def _wrap_applied_output( result.index = default_index(len(result)) return result - def _aggregate_named(self, func, *args, **kwargs): - # Note: this is very similar to _aggregate_series_pure_python, - # but that does not pin group.name - result = {} - initialized = False - - for name, group in self._grouper.get_iterator(self._obj_with_exclusions): - # needed for pandas/tests/groupby/test_groupby.py::test_basic_aggregations - object.__setattr__(group, "name", name) - - output = func(group, *args, **kwargs) - output = ops.extract_result(output) - if not initialized: - # We only do this validation on the first iteration - ops.check_result_array(output, group.dtype) - initialized = True - result[name] = output - - return result - __examples_series_doc = dedent( """ >>> ser = pd.Series([390.0, 350.0, 30.0, 20.0], @@ -1566,6 +1573,61 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) relabeling, func, columns, order = reconstruct_func(func, **kwargs) func = maybe_mangle_lambdas(func) + if maybe_use_numba(engine): + # Not all agg functions support numba, only propagate numba kwargs + # if user asks for numba + kwargs["engine"] = engine + kwargs["engine_kwargs"] = engine_kwargs + + op = GroupByApply(self, func, args=args, kwargs=kwargs) + result = op.agg() + if not is_dict_like(func) and result is not None: + # GH #52849 + if not self.as_index and is_list_like(func): + return result.reset_index() + else: + return result + elif relabeling: + # this should be the only (non-raising) case with relabeling + # used reordered index of columns + result = cast(DataFrame, result) + result = result.iloc[:, order] + result = cast(DataFrame, result) + # error: Incompatible types in assignment (expression has type + # "Optional[List[str]]", variable has type + # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]], + # Index, Series], Sequence[Any]]") + result.columns = columns # type: ignore[assignment] + + if result is None: + # Remove the kwargs we inserted + # (already stored in engine, engine_kwargs arguments) + if "engine" in kwargs: + del kwargs["engine"] + del kwargs["engine_kwargs"] + # at this point func is not a str, list-like, dict-like, + # or a known callable(e.g. 
sum) + if maybe_use_numba(engine): + return self._aggregate_with_numba( + func, *args, engine_kwargs=engine_kwargs, **kwargs + ) + # grouper specific aggregations + result = self._python_agg_general(func, *args, **kwargs) + + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) + + return result + + agg = aggregate + + def _agg_for_resample( + self, func=None, *args, engine=None, engine_kwargs=None, **kwargs + ): + relabeling, func, columns, order = reconstruct_func(func, **kwargs) + func = maybe_mangle_lambdas(func) + if maybe_use_numba(engine): # Not all agg functions support numba, only propagate numba kwargs # if user asks for numba @@ -1642,8 +1704,6 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) return result - agg = aggregate - def _python_agg_general(self, func, *args, **kwargs): f = lambda x: func(x, *args, **kwargs) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index fc5747595ad02..8b2b7f71757f0 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -588,6 +588,7 @@ def __init__( self._groupings: list[grouper.Grouping] = list(groupings) self._sort = sort self.dropna = dropna + self._is_resample = False @property def groupings(self) -> list[grouper.Grouping]: @@ -939,12 +940,14 @@ def _aggregate_series_pure_python( for i, group in enumerate(splitter): res = func(group) - res = extract_result(res) - if not initialized: - # We only do this validation on the first iteration - check_result_array(res, group.dtype) - initialized = True + if self._is_resample: + res = extract_result(res) + + if not initialized: + # We only do this validation on the first iteration + check_result_array(res, group.dtype) + initialized = True result[i] = res diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 8a2e3fbf500a4..c5036a2b32967 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -29,6 +29,7 @@ doc, ) +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( ensure_platform_int, ensure_python_int, @@ -42,6 +43,7 @@ from pandas.core import ops import pandas.core.common as com from pandas.core.construction import extract_array +from pandas.core.indexers import check_array_indexer import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( Index, @@ -1048,6 +1050,18 @@ def __getitem__(self, key): "and integer or boolean " "arrays are valid indices" ) + elif com.is_bool_indexer(key): + if isinstance(getattr(key, "dtype", None), ExtensionDtype): + np_key = key.to_numpy(dtype=bool, na_value=False) + else: + np_key = np.asarray(key, dtype=bool) + check_array_indexer(self._range, np_key) # type: ignore[arg-type] + # Short circuit potential _shallow_copy check + if np_key.all(): + return self._simple_new(self._range, name=self.name) + elif not np_key.any(): + return self._simple_new(_empty_range, name=self.name) + return self.take(np.flatnonzero(np_key)) return super().__getitem__(key) def _getitem_slice(self, slobj: slice) -> Self: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 4a5feb92c02f9..0e7dc2ca92c68 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -249,6 +249,7 @@ def _get_binner(self): binner, bins, binlabels = self._get_binner_for_time() assert len(bins) == len(binlabels) bin_grouper = BinGrouper(bins, binlabels, indexer=self._indexer) + bin_grouper._is_resample = True return binner, bin_grouper @overload @@ 
-372,7 +373,21 @@ def aggregate(self, func=None, *args, **kwargs): return result agg = aggregate - apply = aggregate + + @final + @doc( + _shared_docs["aggregate"], + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + klass="DataFrame", + axis="", + ) + def apply(self, func=None, *args, **kwargs): + result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() + if result is None: + how = func + result = self._groupby_and_apply(how, *args, **kwargs) + return result @final def transform(self, arg, *args, **kwargs): @@ -489,6 +504,53 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): return self._wrap_result(result) + def _groupby_and_apply(self, how, *args, **kwargs): + """ + Re-evaluate the obj with a groupby aggregation. + """ + grouper = self._grouper + + # Excludes `on` column when provided + obj = self._obj_with_exclusions + + grouped = get_groupby(obj, by=None, grouper=grouper, group_keys=self.group_keys) + + try: + if callable(how): + # TODO: test_resample_apply_with_additional_args fails if we go + # through the non-lambda path, not clear that it should. + func = lambda x: how(x, *args, **kwargs) + result = grouped._agg_for_resample(func) + else: + result = grouped._agg_for_resample(how, *args, **kwargs) + except (AttributeError, KeyError): + # we have a non-reducing function; try to evaluate + # alternatively we want to evaluate only a column of the input + + # test_apply_to_one_column_of_df the function being applied references + # a DataFrame column, but aggregate_item_by_item operates column-wise + # on Series, raising AttributeError or KeyError + # (depending on whether the column lookup uses getattr/__getitem__) + result = _apply( + grouped, how, *args, include_groups=self.include_groups, **kwargs + ) + + except ValueError as err: + if "Must produce aggregated value" in str(err): + # raised in _aggregate_named + # see test_apply_without_aggregation, test_apply_with_mutated_index + pass + else: + raise + + # we have a non-reducing function + # try to evaluate + result = _apply( + grouped, how, *args, include_groups=self.include_groups, **kwargs + ) + + return self._wrap_result(result) + @final def _get_resampler_for_grouping( self, groupby: GroupBy, key, include_groups: bool = True @@ -1544,6 +1606,7 @@ def func(x): _upsample = _apply _downsample = _apply _groupby_and_aggregate = _apply + _groupby_and_apply = _apply @final def _gotitem(self, key, ndim, subset=None): diff --git a/pandas/io/excel/_calamine.py b/pandas/io/excel/_calamine.py index 1f721c65982d4..b8994e679d4b1 100644 --- a/pandas/io/excel/_calamine.py +++ b/pandas/io/excel/_calamine.py @@ -75,7 +75,7 @@ def load_workbook( from python_calamine import load_workbook return load_workbook( - filepath_or_buffer, # type: ignore[arg-type] + filepath_or_buffer, **engine_kwargs, ) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 255784e8bf24d..4185ab4af0bbb 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -47,13 +47,27 @@ def test_agg_regression1(tsframe): def test_agg_must_agg(df): grouped = df.groupby("A")["C"] + expected = pd.Series( + { + "bar": df[df.A == "bar"]["C"].describe(), + "foo": df[df.A == "foo"]["C"].describe(), + }, + index=pd.Index(["bar", "foo"], name="A"), + name="C", + ) + result = grouped.agg(lambda x: x.describe()) + tm.assert_series_equal(result, expected) - msg = "Must produce aggregated value" - with pytest.raises(Exception, 
match=msg): - grouped.agg(lambda x: x.describe()) - with pytest.raises(Exception, match=msg): - grouped.agg(lambda x: x.index[:2]) - + expected = pd.Series( + { + "bar": df[df.A == "bar"]["C"].index[:2], + "foo": df[df.A == "foo"]["C"].index[:2], + }, + index=pd.Index(["bar", "foo"], name="A"), + name="C", + ) + result = grouped.agg(lambda x: x.index[:2]) + tm.assert_series_equal(result, expected) def test_agg_ser_multi_key(df): f = lambda x: x.sum() @@ -1440,12 +1454,10 @@ def test_groupby_agg_precision(any_real_numeric_dtype): "key3": pd.array([max_value], dtype=any_real_numeric_dtype), } ) - arrays = [["a"], ["b"]] - index = MultiIndex.from_arrays(arrays, names=("key1", "key2")) - expected = DataFrame( - {"key3": pd.array([max_value], dtype=any_real_numeric_dtype)}, index=index - ) + expected = DataFrame({"key3": [df["key3"]]}, + index=pd.MultiIndex(levels=[["a"], ["b"]], codes=[[0], [0]], names=["key1", "key2"])) + result = df.groupby(["key1", "key2"]).agg(lambda x: x) tm.assert_frame_equal(result, expected) @@ -1525,26 +1537,27 @@ def test_groupby_complex_raises(func): @pytest.mark.parametrize( - "test, constant", + "test, values", [ - ([[20, "A"], [20, "B"], [10, "C"]], {0: [10, 20], 1: ["C", ["A", "B"]]}), - ([[20, "A"], [20, "B"], [30, "C"]], {0: [20, 30], 1: [["A", "B"], "C"]}), - ([["a", 1], ["a", 1], ["b", 2], ["b", 3]], {0: ["a", "b"], 1: [1, [2, 3]]}), - pytest.param( - [["a", 1], ["a", 2], ["b", 3], ["b", 3]], - {0: ["a", "b"], 1: [[1, 2], 3]}, - marks=pytest.mark.xfail, - ), + ([[20, "A"], [20, "B"], [10, "C"]], [10, 20]), + ([[20, "A"], [20, "B"], [30, "C"]], [20, 30]), + ([["a", 1], ["a", 1], ["b", 2], ["b", 3]], ["a", "b"]), + ([["a", 1], ["a", 2], ["b", 3], ["b", 3]], ["a", "b"]), ], ) -def test_agg_of_mode_list(test, constant): +def test_agg_of_mode_list(test, values): # GH#25581 df1 = DataFrame(test) result = df1.groupby(0).agg(Series.mode) # Mode usually only returns 1 value, but can return a list in the case of a tie. 
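For context on the expectations being rewritten here: with this patch, a ``func`` passed to ``agg`` is no longer required to reduce each group to a scalar; whatever it returns per group is stored as a value of the result. A rough sketch of the mode-with-ties case, with the output shape inferred from the updated test above (an assumption about post-patch behavior, not about released pandas):

```python
import pandas as pd

df = pd.DataFrame([["a", 1], ["a", 2], ["b", 3], ["b", 3]])
result = df.groupby(0).agg(pd.Series.mode)

# Group "a" ties between 1 and 2, so its mode is a two-element Series;
# group "b" reduces to the single value 3. Under this patch both are
# stored as-is as cell values of the aggregated frame.
print(result.loc["a", 1])
```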
- expected = DataFrame(constant) - expected = expected.set_index(0) + expected = DataFrame( + [ + [df1[df1[0] == value][1].mode()] for value in values + ], + index=pd.Index(values, name=0), + columns=[1], + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 12f99e3cf7a63..7363e79cc5303 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -619,7 +619,7 @@ def test_agg_lambda_with_timezone(): ) result = df.groupby("tag").agg({"date": lambda e: e.head(1)}) expected = DataFrame( - [pd.Timestamp("2018-01-01", tz="UTC")], + [[df["date"].head(1)]], index=Index([1], name="tag"), columns=["date"], ) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 76c8a6fdb9570..5b0b017eb9b5b 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1319,9 +1319,7 @@ def test_groupby_cat_preserves_structure(observed, ordered): expected = df.copy() result = ( - df.groupby("Name", observed=observed) - .agg(DataFrame.sum, skipna=True) - .reset_index() + df.groupby("Name", observed=observed).agg(Series.sum, skipna=True).reset_index() ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index d02e22c29159f..308d6eb892178 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2471,7 +2471,7 @@ def test_by_column_values_with_same_starting_value(dtype): result = df.groupby(["Name"]).agg(aggregate_details) expected_result = DataFrame( { - "Mood": [["happy", "sad"], "happy"], + "Mood": [pd.Series(["happy", "sad"]), pd.Series(["happy"])], "Credit": [2500, 900], "Name": ["Thomas", "Thomas John"], } diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index e304a5ae467d8..0f5fd50abbec8 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -65,22 +65,16 @@ def test_basic_aggregations(dtype): with pytest.raises(pd.errors.SpecificationError, match=msg): grouped.aggregate({"one": np.mean, "two": np.std}) - group_constants = {0: 10, 1: 20, 2: 30} - msg = ( - "Pinning the groupby key to each group in SeriesGroupBy.agg is deprecated, " - "and cases that relied on it will raise in a future version" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#41090 - agged = grouped.agg(lambda x: group_constants[x.name] + x.mean()) - assert agged[1] == 21 - # corner cases - msg = "Must produce aggregated value" - # exception raised is type Exception - with pytest.raises(Exception, match=msg): - grouped.aggregate(lambda x: x * 2) - + result = grouped.aggregate(lambda x: x * 2) + expected = pd.Series( + { + 0: data[data.index // 3 == 0] * 2, + 1: data[data.index // 3 == 1] * 2, + 2: data[data.index // 3 == 2] * 2, + }, + ) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "vals", diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index d500687763a1e..8b4b7a5d70ee4 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -623,3 +623,41 @@ def test_append_one_nonempty_preserve_step(): expected = RangeIndex(0, -1, -1) result = RangeIndex(0).append([expected]) tm.assert_index_equal(result, expected, exact=True) + + +def test_getitem_boolmask_all_true(): + 
ri = RangeIndex(3, name="foo") + expected = ri.copy() + result = ri[[True] * 3] + tm.assert_index_equal(result, expected, exact=True) + + +def test_getitem_boolmask_all_false(): + ri = RangeIndex(3, name="foo") + result = ri[[False] * 3] + expected = RangeIndex(0, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + +def test_getitem_boolmask_returns_rangeindex(): + ri = RangeIndex(3, name="foo") + result = ri[[False, True, True]] + expected = RangeIndex(1, 3, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + result = ri[[True, False, True]] + expected = RangeIndex(0, 3, 2, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + +def test_getitem_boolmask_returns_index(): + ri = RangeIndex(4, name="foo") + result = ri[[True, True, False, True]] + expected = Index([0, 1, 3], name="foo") + tm.assert_index_equal(result, expected) + + +def test_getitem_boolmask_wrong_length(): + ri = RangeIndex(4, name="foo") + with pytest.raises(IndexError, match="Boolean index has wrong length"): + ri[[True]] diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index aff08e5ad3882..529d6d789596f 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -132,6 +132,8 @@ def test_append_series(setup_path): # select on the index and values expected = ns[(ns > 70) & (ns.index < 90)] + # Reading/writing RangeIndex info is not supported yet + expected.index = Index(expected.index._data) result = store.select("ns", "foo>70 and index<90") tm.assert_series_equal(result, expected, check_index_type=True) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index f3b9c909290a8..6268a9e90e47d 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -643,10 +643,10 @@ def test_agg_list_like_func_with_args(): ) def foo1(x, a=1, c=0): - return x + a + c + return x.sum() + a + c def foo2(x, b=2, c=0): - return x + b + c + return x.sum() + b + c msg = r"foo1\(\) got an unexpected keyword argument 'b'" with pytest.raises(TypeError, match=msg): diff --git a/web/pandas/community/benchmarks.md b/web/pandas/community/benchmarks.md new file mode 100644 index 0000000000000..ffce00be96bca --- /dev/null +++ b/web/pandas/community/benchmarks.md @@ -0,0 +1,79 @@ +# Benchmarks + +Benchmarks are tests to measure the performance of pandas. There are two different +kinds of benchmarks relevant to pandas: + +* Internal pandas benchmarks to measure speed and memory usage over time +* Community benchmarks comparing the speed or memory usage of different tools at + doing the same job + +## pandas benchmarks + +pandas benchmarks are implemented in the [asv_bench](https://github.com/pandas-dev/pandas/tree/main/asv_bench) +directory of our repository. The benchmarks are implemented for the +[airspeed velocity](https://asv.readthedocs.io/en/v0.6.1/) (asv for short) framework. + +The benchmarks can be run locally by any pandas developer. This can be done +with the `asv run` command, and it can be useful to detect if local changes have +an impact in performance, by running the benchmarks before and after the changes. +More information on running the performance test suite is found +[here](https://pandas.pydata.org/docs/dev/development/contributing_codebase.html#running-the-performance-test-suite). 
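As a concrete illustration of the local workflow described above (a sketch following the asv documentation; the ``^!`` commit syntax and the ``-f``/``-b`` flags are standard asv options, and ``^groupby`` is just an example filter):

```bash
# run from the asv_bench directory of a pandas checkout
cd asv_bench

# run only benchmarks whose names match a regex, on the current commit
asv run -b ^groupby HEAD^!

# compare the current commit against upstream/main and report
# benchmarks that changed by more than 10%
asv continuous -f 1.1 upstream/main HEAD -b ^groupby
```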
+
+Note that benchmarks are not deterministic, and running on different hardware or
+running on the same hardware with different levels of stress can have a big impact
+on the result. Even running the benchmarks with identical hardware and almost
+identical conditions produces significant differences when running the exact same
+code.
+
+## pandas benchmarks servers
+
+We currently have two physical servers running the benchmarks of pandas for every
+(or almost every) commit to the `main` branch. The servers run independently from
+each other. The original server has been running for a long time, and it is physically
+located with one of the pandas maintainers. The newer server is in a datacenter
+kindly sponsored by [OVHCloud](https://www.ovhcloud.com/). More information about
+pandas sponsors, and how your company can support the development of pandas, is
+available at the [pandas sponsors]({{ base_url }}about/sponsors.html) page.
+
+Results of the benchmarks are available at:
+
+- Original server: [asv](https://asv-runner.github.io/asv-collection/pandas/)
+- OVH server: [asv](https://pandas.pydata.org/benchmarks/asv/) (benchmark results can
+  also be visualized in this [Conbench PoC](http://57.128.112.95:5000/))
+
+### Original server configuration
+
+The machine can be configured with the Ansible playbook in
+[tomaugspurger/asv-runner](https://github.com/tomaugspurger/asv-runner).
+The results are published to another GitHub repository,
+[tomaugspurger/asv-collection](https://github.com/tomaugspurger/asv-collection).
+
+The benchmarks are scheduled by [Airflow](https://airflow.apache.org/).
+It has a dashboard for viewing and debugging the results.
+You’ll need to set up an SSH tunnel to view them:
+
+```
+ssh -L 8080:localhost:8080 pandas@panda.likescandy.com
+```
+
+### OVH server configuration
+
+The server used to run the benchmarks has been configured to reduce system
+noise and maximize the stability of the benchmark times.
+
+The details on how the server is configured can be found in the
+[pandas-benchmarks repository](https://github.com/pandas-dev/pandas-benchmarks).
+There is a quick summary here: + +- CPU isolation: Avoid user space tasks to execute in the same CPU as benchmarks, possibly interrupting them during the execution (include all virtual CPUs using a physical core) +- NoHZ: Stop the kernel tick that enables context switching in the isolated CPU +- IRQ affinity: Ban benchmarks CPU to avoid many (but not all) kernel interruption in the isolated CPU +- TurboBoost: Disable CPU scaling based on high CPU demand +- P-States: Use "performance" governor to disable P-States and CPU frequency changes based on them +- C-States: Set C-State to 0 and disable changes to avoid slower CPU after system inactivity + +## Community benchmarks + +The main benchmarks comparing dataframe tools that include pandas are: + +- [H2O.ai benchmarks](https://h2oai.github.io/db-benchmark/) +- [TPCH benchmarks](https://pola.rs/posts/benchmarks/) diff --git a/web/pandas/config.yml b/web/pandas/config.yml index 27e5ea25c1bad..05fdea13cab43 100644 --- a/web/pandas/config.yml +++ b/web/pandas/config.yml @@ -54,6 +54,8 @@ navbar: target: community/coc.html - name: "Ecosystem" target: community/ecosystem.html + - name: "Benchmarks" + target: community/benchmarks.html - name: "Contribute" target: contribute.html blog: From 12ef132ba484dfeb408a8031b5c4473f52c325c4 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 2 Mar 2024 13:00:01 -0500 Subject: [PATCH 2/8] cleanup --- doc/source/development/maintaining.rst | 28 ++++++++ doc/source/whatsnew/v3.0.0.rst | 7 +- pandas/core/indexes/range.py | 14 ---- pandas/io/excel/_calamine.py | 2 +- pandas/tests/indexes/ranges/test_range.py | 38 ----------- pandas/tests/io/pytables/test_append.py | 2 - web/pandas/community/benchmarks.md | 79 ----------------------- web/pandas/config.yml | 2 - 8 files changed, 32 insertions(+), 140 deletions(-) delete mode 100644 web/pandas/community/benchmarks.md diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index 5d833dca50732..f177684b8c98f 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -326,6 +326,34 @@ a milestone before tagging, you can request the bot to backport it with: @Meeseeksdev backport +.. _maintaining.asv-machine: + +Benchmark machine +----------------- + +The team currently owns dedicated hardware for hosting a website for pandas' ASV performance benchmark. The results +are published to https://asv-runner.github.io/asv-collection/pandas/ + +Configuration +````````````` + +The machine can be configured with the `Ansible `_ playbook in https://github.com/tomaugspurger/asv-runner. + +Publishing +`````````` + +The results are published to another GitHub repository, https://github.com/tomaugspurger/asv-collection. +Finally, we have a cron job on our docs server to pull from https://github.com/tomaugspurger/asv-collection, to serve them from ``/speed``. +Ask Tom or Joris for access to the webserver. + +Debugging +````````` + +The benchmarks are scheduled by Airflow. It has a dashboard for viewing and debugging the results. You'll need to setup an SSH tunnel to view them + + ssh -L 8080:localhost:8080 pandas@panda.likescandy.com + + .. 
_maintaining.release: Release process diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ceccec47a4269..a95f0485abd5f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -191,7 +191,6 @@ Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`) - :meth:`Series.dt.to_pydatetime` now returns a :class:`Series` of :py:class:`datetime.datetime` objects (:issue:`52459`) -- :meth:`SeriesGroupBy.agg` no longer pins the name of the group to the input passed to the provided ``func`` (:issue:`51703`) - All arguments except ``name`` in :meth:`Index.rename` are now keyword only (:issue:`56493`) - All arguments except the first ``path``-like argument in IO writers are now keyword only (:issue:`54229`) - All arguments in :meth:`Index.sort_values` are now keyword only (:issue:`56493`) @@ -239,12 +238,12 @@ Removal of prior version deprecations/changes - Removed unused arguments ``*args`` and ``**kwargs`` in :class:`Resampler` methods (:issue:`50977`) - Unrecognized timezones when parsing strings to datetimes now raises a ``ValueError`` (:issue:`51477`) + .. --------------------------------------------------------------------------- .. _whatsnew_300.performance: Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`) - Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`) - Performance improvement in :meth:`DataFrame.join` for sorted but non-unique indexes (:issue:`56941`) - Performance improvement in :meth:`DataFrame.join` when left and/or right are non-unique and ``how`` is ``"left"``, ``"right"``, or ``"inner"`` (:issue:`56817`) @@ -253,11 +252,11 @@ Performance improvements - Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`) - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`) - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`) -- Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`) - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`) -- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) +- :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`?``) +- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) .. --------------------------------------------------------------------------- .. 
_whatsnew_300.bug_fixes: diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index c5036a2b32967..8a2e3fbf500a4 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -29,7 +29,6 @@ doc, ) -from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( ensure_platform_int, ensure_python_int, @@ -43,7 +42,6 @@ from pandas.core import ops import pandas.core.common as com from pandas.core.construction import extract_array -from pandas.core.indexers import check_array_indexer import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( Index, @@ -1050,18 +1048,6 @@ def __getitem__(self, key): "and integer or boolean " "arrays are valid indices" ) - elif com.is_bool_indexer(key): - if isinstance(getattr(key, "dtype", None), ExtensionDtype): - np_key = key.to_numpy(dtype=bool, na_value=False) - else: - np_key = np.asarray(key, dtype=bool) - check_array_indexer(self._range, np_key) # type: ignore[arg-type] - # Short circuit potential _shallow_copy check - if np_key.all(): - return self._simple_new(self._range, name=self.name) - elif not np_key.any(): - return self._simple_new(_empty_range, name=self.name) - return self.take(np.flatnonzero(np_key)) return super().__getitem__(key) def _getitem_slice(self, slobj: slice) -> Self: diff --git a/pandas/io/excel/_calamine.py b/pandas/io/excel/_calamine.py index b8994e679d4b1..1f721c65982d4 100644 --- a/pandas/io/excel/_calamine.py +++ b/pandas/io/excel/_calamine.py @@ -75,7 +75,7 @@ def load_workbook( from python_calamine import load_workbook return load_workbook( - filepath_or_buffer, + filepath_or_buffer, # type: ignore[arg-type] **engine_kwargs, ) diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 8b4b7a5d70ee4..d500687763a1e 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -623,41 +623,3 @@ def test_append_one_nonempty_preserve_step(): expected = RangeIndex(0, -1, -1) result = RangeIndex(0).append([expected]) tm.assert_index_equal(result, expected, exact=True) - - -def test_getitem_boolmask_all_true(): - ri = RangeIndex(3, name="foo") - expected = ri.copy() - result = ri[[True] * 3] - tm.assert_index_equal(result, expected, exact=True) - - -def test_getitem_boolmask_all_false(): - ri = RangeIndex(3, name="foo") - result = ri[[False] * 3] - expected = RangeIndex(0, name="foo") - tm.assert_index_equal(result, expected, exact=True) - - -def test_getitem_boolmask_returns_rangeindex(): - ri = RangeIndex(3, name="foo") - result = ri[[False, True, True]] - expected = RangeIndex(1, 3, name="foo") - tm.assert_index_equal(result, expected, exact=True) - - result = ri[[True, False, True]] - expected = RangeIndex(0, 3, 2, name="foo") - tm.assert_index_equal(result, expected, exact=True) - - -def test_getitem_boolmask_returns_index(): - ri = RangeIndex(4, name="foo") - result = ri[[True, True, False, True]] - expected = Index([0, 1, 3], name="foo") - tm.assert_index_equal(result, expected) - - -def test_getitem_boolmask_wrong_length(): - ri = RangeIndex(4, name="foo") - with pytest.raises(IndexError, match="Boolean index has wrong length"): - ri[[True]] diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 529d6d789596f..aff08e5ad3882 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -132,8 +132,6 @@ def test_append_series(setup_path): # select on the index and 
values expected = ns[(ns > 70) & (ns.index < 90)] - # Reading/writing RangeIndex info is not supported yet - expected.index = Index(expected.index._data) result = store.select("ns", "foo>70 and index<90") tm.assert_series_equal(result, expected, check_index_type=True) diff --git a/web/pandas/community/benchmarks.md b/web/pandas/community/benchmarks.md deleted file mode 100644 index ffce00be96bca..0000000000000 --- a/web/pandas/community/benchmarks.md +++ /dev/null @@ -1,79 +0,0 @@ -# Benchmarks - -Benchmarks are tests to measure the performance of pandas. There are two different -kinds of benchmarks relevant to pandas: - -* Internal pandas benchmarks to measure speed and memory usage over time -* Community benchmarks comparing the speed or memory usage of different tools at - doing the same job - -## pandas benchmarks - -pandas benchmarks are implemented in the [asv_bench](https://github.com/pandas-dev/pandas/tree/main/asv_bench) -directory of our repository. The benchmarks are implemented for the -[airspeed velocity](https://asv.readthedocs.io/en/v0.6.1/) (asv for short) framework. - -The benchmarks can be run locally by any pandas developer. This can be done -with the `asv run` command, and it can be useful to detect if local changes have -an impact in performance, by running the benchmarks before and after the changes. -More information on running the performance test suite is found -[here](https://pandas.pydata.org/docs/dev/development/contributing_codebase.html#running-the-performance-test-suite). - -Note that benchmarks are not deterministic, and running in different hardware or -running in the same hardware with different levels of stress have a big impact in -the result. Even running the benchmarks with identical hardware and almost identical -conditions produces significant differences when running the same exact code. - -## pandas benchmarks servers - -We currently have two physical servers running the benchmarks of pandas for every -(or almost every) commit to the `main` branch. The servers run independently from -each other. The original server has been running for a long time, and it is physically -located with one of the pandas maintainers. The newer server is in a datacenter -kindly sponsored by [OVHCloud](https://www.ovhcloud.com/). More information about -pandas sponsors, and how your company can support the development of pandas is -available at the [pandas sponsors]({{ base_url }}about/sponsors.html) page. - -Results of the benchmarks are available at: - -- Original server: [asv](https://asv-runner.github.io/asv-collection/pandas/) -- OVH server: [asv](https://pandas.pydata.org/benchmarks/asv/) (benchmarks results can - also be visualized in this [Conbench PoC](http://57.128.112.95:5000/) - -### Original server configuration - -The machine can be configured with the Ansible playbook in -[tomaugspurger/asv-runner](https://github.com/tomaugspurger/asv-runner). -The results are published to another GitHub repository, -[tomaugspurger/asv-collection](https://github.com/tomaugspurger/asv-collection). - -The benchmarks are scheduled by [Airflow](https://airflow.apache.org/). -It has a dashboard for viewing and debugging the results. -You’ll need to setup an SSH tunnel to view them: - -``` -ssh -L 8080:localhost:8080 pandas@panda.likescandy.com -``` - -### OVH server configuration - -The server used to run the benchmarks has been configured to reduce system -noise and maximize the stability of the benchmarks times. 
- -The details on how the server is configured can be found in the -[pandas-benchmarks repository](https://github.com/pandas-dev/pandas-benchmarks). -There is a quick summary here: - -- CPU isolation: Avoid user space tasks to execute in the same CPU as benchmarks, possibly interrupting them during the execution (include all virtual CPUs using a physical core) -- NoHZ: Stop the kernel tick that enables context switching in the isolated CPU -- IRQ affinity: Ban benchmarks CPU to avoid many (but not all) kernel interruption in the isolated CPU -- TurboBoost: Disable CPU scaling based on high CPU demand -- P-States: Use "performance" governor to disable P-States and CPU frequency changes based on them -- C-States: Set C-State to 0 and disable changes to avoid slower CPU after system inactivity - -## Community benchmarks - -The main benchmarks comparing dataframe tools that include pandas are: - -- [H2O.ai benchmarks](https://h2oai.github.io/db-benchmark/) -- [TPCH benchmarks](https://pola.rs/posts/benchmarks/) diff --git a/web/pandas/config.yml b/web/pandas/config.yml index 05fdea13cab43..27e5ea25c1bad 100644 --- a/web/pandas/config.yml +++ b/web/pandas/config.yml @@ -54,8 +54,6 @@ navbar: target: community/coc.html - name: "Ecosystem" target: community/ecosystem.html - - name: "Benchmarks" - target: community/benchmarks.html - name: "Contribute" target: contribute.html blog: From 87d32aebb4e97be8db9d06be2c9d8c312a06138e Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 3 Mar 2024 07:23:40 -0500 Subject: [PATCH 3/8] Test fixup --- pandas/core/groupby/generic.py | 4 +++- pandas/tests/reshape/test_pivot.py | 9 +++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9d9d5a7e49ed1..0f924fcae4d52 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1716,7 +1716,9 @@ def _python_agg_general(self, func, *args, **kwargs): if not len(obj.columns): # e.g. 
test_margins_no_values_no_cols - return self._python_apply_general(f, self._selected_obj) + return obj._constructor( + index=self._grouper.result_index, columns=obj.columns + ) output: dict[int, ArrayLike] = {} for idx, (name, ser) in enumerate(obj.items()): diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 99250dc929997..f7ea9890b0af7 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1198,8 +1198,13 @@ def test_margins_no_values_no_cols(self, data): result = data[["A", "B"]].pivot_table( index=["A", "B"], aggfunc=len, margins=True ) - result_list = result.tolist() - assert sum(result_list[:-1]) == result_list[-1] + index = MultiIndex( + levels=[["bar", "foo", "All"], ["one", "two", ""]], + codes=[[0, 0, 1, 1, 2], [0, 1, 0, 1, 2]], + names=["A", "B"], + ) + expected = DataFrame(index=index, columns=[]) + tm.assert_frame_equal(result, expected) def test_margins_no_values_two_rows(self, data): # Regression test on pivot table: no values passed but rows are a From 0849e1cba4f36b63dd88ec0de9b25543ea0ea413 Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 4 Mar 2024 22:27:52 -0500 Subject: [PATCH 4/8] WIP --- pandas/core/groupby/generic.py | 2 +- pandas/tests/reshape/test_pivot.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9d9d5a7e49ed1..c44e36c68733b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1714,7 +1714,7 @@ def _python_agg_general(self, func, *args, **kwargs): obj = self._obj_with_exclusions - if not len(obj.columns): + if self._grouper._is_resample and not len(obj.columns): # e.g. test_margins_no_values_no_cols return self._python_apply_general(f, self._selected_obj) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 99250dc929997..1a54a8c3bb18d 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1198,8 +1198,15 @@ def test_margins_no_values_no_cols(self, data): result = data[["A", "B"]].pivot_table( index=["A", "B"], aggfunc=len, margins=True ) - result_list = result.tolist() - assert sum(result_list[:-1]) == result_list[-1] + expected = DataFrame( + index=MultiIndex( + levels=[["bar", "foo", "All"], ["one", "two", ""]], + codes=[[0, 0, 1, 1, 2], [0, 1, 0, 1, 2]], + names=["A", "B"], + ), + columns=Index([]), + ) + tm.assert_frame_equal(result, expected) def test_margins_no_values_two_rows(self, data): # Regression test on pivot table: no values passed but rows are a From 08d26c6d1dabaab11115e329009f1490a8bfd292 Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 5 Mar 2024 18:43:48 -0500 Subject: [PATCH 5/8] Fixup for pivot --- pandas/core/reshape/pivot.py | 2 +- pandas/tests/reshape/test_pivot.py | 38 +++++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 36edf6116609b..ab19437332ab8 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -441,7 +441,7 @@ def _generate_marginal_results_without_values( margins_name: Hashable = "All", ): margin_keys: list | Index - if len(cols) > 0: + if len(table.columns) > 0: # need to "interleave" the margins margin_keys = [] diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 1a54a8c3bb18d..e7fcab4241f60 100644 --- a/pandas/tests/reshape/test_pivot.py +++ 
b/pandas/tests/reshape/test_pivot.py @@ -1214,7 +1214,19 @@ def test_margins_no_values_two_rows(self, data): result = data[["A", "B", "C"]].pivot_table( index=["A", "B"], columns="C", aggfunc=len, margins=True ) - assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0] + expected = DataFrame( + index=MultiIndex( + levels=[["bar", "foo", "All"], ["one", "two", ""]], + codes=[[0, 0, 1, 1, 2], [0, 1, 0, 1, 2]], + names=["A", "B"], + ), + columns=MultiIndex( + levels=[[], ["dull", "shiny"]], + codes=[[], []], + names=[None, "C"], + ), + ) + tm.assert_frame_equal(result, expected) def test_margins_no_values_one_row_one_col(self, data): # Regression test on pivot table: no values passed but row and col @@ -1222,7 +1234,15 @@ def test_margins_no_values_one_row_one_col(self, data): result = data[["A", "B"]].pivot_table( index="A", columns="B", aggfunc=len, margins=True ) - assert result.All.tolist() == [4.0, 7.0, 11.0] + expected = DataFrame( + index=Index(["bar", "foo", "All"], name="A"), + columns=MultiIndex( + levels=[[], ["dull", "shiny"]], + codes=[[], []], + names=[None, "B"], + ), + ) + tm.assert_frame_equal(result, expected) def test_margins_no_values_two_row_two_cols(self, data): # Regression test on pivot table: no values passed but rows and cols @@ -1231,7 +1251,19 @@ def test_margins_no_values_two_row_two_cols(self, data): result = data[["A", "B", "C", "D"]].pivot_table( index=["A", "B"], columns=["C", "D"], aggfunc=len, margins=True ) - assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0] + expected = DataFrame( + index=MultiIndex( + levels=[["bar", "foo", "All"], ["one", "two", ""]], + codes=[[0, 0, 1, 1, 2], [0, 1, 0, 1, 2]], + names=["A", "B"], + ), + columns=MultiIndex( + levels=[[], ["dull", "shiny"], list("abcdefghijk")], + codes=[[], [], []], + names=[None, "C", "D"], + ), + ) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("margin_name", ["foo", "one", 666, None, ["a", "b"]]) def test_pivot_table_with_margins_set_margin_name(self, margin_name, data): From d264e205942e09041077838f26bd8ba654cb27e4 Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 20 Mar 2024 23:21:25 -0400 Subject: [PATCH 6/8] Deprecate passing reduction kernels to groupby.agg --- pandas/core/groupby/generic.py | 27 ++++++++ .../tests/groupby/aggregate/test_aggregate.py | 43 +++++++++---- pandas/tests/groupby/aggregate/test_cython.py | 13 +++- pandas/tests/groupby/test_groupby.py | 11 ++-- pandas/tests/groupby/test_raises.py | 64 +++++++++++++++++-- 5 files changed, 132 insertions(+), 26 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 68a076232d30e..1b22cf65bbcd6 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -21,6 +21,7 @@ Union, cast, ) +import warnings import numpy as np @@ -32,6 +33,7 @@ Substitution, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( ensure_int64, @@ -62,6 +64,10 @@ import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.groupby import base +from pandas.core.groupby.base import ( + reduction_kernels, + transformation_kernels, +) from pandas.core.groupby.groupby import ( GroupBy, GroupByPlot, @@ -326,6 +332,14 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) kwargs = {} if isinstance(func, str): + if func not in reduction_kernels and not self._grouper._is_resample: + meth = "transform" if func in transformation_kernels else "apply" + warnings.warn( + f"In the future, 
using the non-aggregation {func=} will raise a " + f"ValueError, use this function with {type(self).__name__}.{meth}", + category=DeprecationWarning, + stacklevel=find_stack_level(), + ) if maybe_use_numba(engine) and engine is not None: # Not all agg functions support numba, only propagate numba kwargs # if user asks for numba, and engine is not None @@ -1562,6 +1576,19 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) kwargs["engine"] = engine kwargs["engine_kwargs"] = engine_kwargs + if ( + isinstance(func, str) + and func not in reduction_kernels + and not self._grouper._is_resample + ): + meth = "transform" if func in transformation_kernels else "apply" + warnings.warn( + f"In the future, using the non-aggregation {func=} will raise a " + f"ValueError, use this function with {type(self).__name__}.{meth}", + category=DeprecationWarning, + stacklevel=find_stack_level(), + ) + op = GroupByApply(self, func, args=args, kwargs=kwargs) result = op.agg() if not is_dict_like(func) and result is not None: diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 001029c8ebf69..288b1b6cefd49 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -48,28 +48,29 @@ def test_agg_regression1(tsframe): def test_agg_must_agg(df): grouped = df.groupby("A")["C"] - expected = pd.Series( + expected = Series( { "bar": df[df.A == "bar"]["C"].describe(), "foo": df[df.A == "foo"]["C"].describe(), }, - index=pd.Index(["bar", "foo"], name="A"), + index=Index(["bar", "foo"], name="A"), name="C", ) result = grouped.agg(lambda x: x.describe()) tm.assert_series_equal(result, expected) - expected = pd.Series( + expected = Series( { "bar": df[df.A == "bar"]["C"].index[:2], "foo": df[df.A == "foo"]["C"].index[:2], }, - index=pd.Index(["bar", "foo"], name="A"), + index=Index(["bar", "foo"], name="A"), name="C", ) result = grouped.agg(lambda x: x.index[:2]) tm.assert_series_equal(result, expected) + def test_agg_ser_multi_key(df): f = lambda x: x.sum() results = df.C.groupby([df.A, df.B]).aggregate(f) @@ -485,6 +486,9 @@ def test_groupby_agg_dict_dup_columns(): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings( + "ignore:In the future, using the non-aggregation func:DeprecationWarning" +) @pytest.mark.parametrize( "op", [ @@ -564,7 +568,9 @@ def test_order_aggregate_multiple_funcs(): # GH 25692 df = DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]}) - res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"]) + msg = "using the non-aggregation func='ohlc' will raise" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"]) result = res.columns.levels[1] expected = Index(["sum", "max", "mean", "ohlc", "min"]) @@ -1377,9 +1383,14 @@ def test_nonagg_agg(): df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 2, 1]}) g = df.groupby("a") - result = g.agg(["cumsum"]) + msg = "using the non-aggregation func='cumsum' will raise" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = g.agg(["cumsum"]) result.columns = result.columns.droplevel(-1) - expected = g.agg("cumsum") + + msg = "using the non-aggregation func='cumsum' will raise" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = g.agg("cumsum") tm.assert_frame_equal(result, expected) @@ -1450,8 +1461,12 @@ def test_groupby_agg_precision(any_real_numeric_dtype): } ) - 
expected = DataFrame({"key3": [df["key3"]]}, - index=pd.MultiIndex(levels=[["a"], ["b"]], codes=[[0], [0]], names=["key1", "key2"])) + expected = DataFrame( + {"key3": [df["key3"]]}, + index=MultiIndex( + levels=[["a"], ["b"]], codes=[[0], [0]], names=["key1", "key2"] + ), + ) result = df.groupby(["key1", "key2"]).agg(lambda x: x) tm.assert_frame_equal(result, expected) @@ -1547,10 +1562,8 @@ def test_agg_of_mode_list(test, values): # Mode usually only returns 1 value, but can return a list in the case of a tie. expected = DataFrame( - [ - [df1[df1[0] == value][1].mode()] for value in values - ], - index=pd.Index(values, name=0), + [[df1[df1[0] == value][1].mode()] for value in values], + index=Index(values, name=0), columns=[1], ) @@ -1659,7 +1672,9 @@ def test_groupby_agg_extension_timedelta_cumsum_with_named_aggregation(): } ) gb = df.groupby("grps") - result = gb.agg(td=("td", "cumsum")) + msg = "using the non-aggregation func='cumsum' will raise" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = gb.agg(td=("td", "cumsum")) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index aafd06e8f88cf..ac2abba2e353f 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -21,6 +21,7 @@ bdate_range, ) import pandas._testing as tm +from pandas.core.groupby.base import reduction_kernels @pytest.mark.parametrize( @@ -287,8 +288,16 @@ def test_read_only_buffer_source_agg(agg): ) df._mgr.arrays[0].flags.writeable = False - result = df.groupby(["species"]).agg({"sepal_length": agg}) - expected = df.copy().groupby(["species"]).agg({"sepal_length": agg}) + if agg in reduction_kernels: + warn = None + msg = "" + else: + warn = DeprecationWarning + msg = f"using the non-aggregation func='{agg}' will raise" + with tm.assert_produces_warning(warn, match=msg): + result = df.groupby(["species"]).agg({"sepal_length": agg}) + with tm.assert_produces_warning(warn, match=msg): + expected = df.copy().groupby(["species"]).agg({"sepal_length": agg}) tm.assert_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index ac98ae088aacb..c6b1afc646554 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2467,7 +2467,7 @@ def test_by_column_values_with_same_starting_value(dtype): result = df.groupby(["Name"]).agg(aggregate_details) expected_result = DataFrame( { - "Mood": [pd.Series(["happy", "sad"]), pd.Series(["happy"])], + "Mood": [Series(["happy", "sad"]), Series(["happy"])], "Credit": [2500, 900], "Name": ["Thomas", "Thomas John"], } @@ -2935,9 +2935,12 @@ def test_groupby_dropna_with_nunique_unique(): # GH#42016 df = [[1, 1, 1, "A"], [1, None, 1, "A"], [1, None, 2, "A"], [1, None, 3, "A"]] df_dropna = DataFrame(df, columns=["a", "b", "c", "partner"]) - result = df_dropna.groupby(["a", "b", "c"], dropna=False).agg( - {"partner": ["nunique", "unique"]} - ) + + msg = "using the non-aggregation func='unique' will raise" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df_dropna.groupby(["a", "b", "c"], dropna=False).agg( + {"partner": ["nunique", "unique"]} + ) index = MultiIndex.from_tuples( [(1, 1.0, 1), (1, np.nan, 1), (1, np.nan, 2), (1, np.nan, 3)], diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index f9d5de72eda1d..1012e32ef8923 100644 --- 
a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -15,6 +15,7 @@ Series, ) import pandas._testing as tm +from pandas.core.groupby.base import reduction_kernels from pandas.tests.groupby import get_groupby_method_args @@ -84,8 +85,10 @@ def df_with_cat_col(): return df -def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""): - warn_klass = None if warn_msg == "" else FutureWarning +def _call_and_check( + klass, msg, how, gb, groupby_func, args, warn_msg="", warn_category=FutureWarning +): + warn_klass = None if warn_msg == "" else warn_category with tm.assert_produces_warning(warn_klass, match=warn_msg, check_stacklevel=False): if klass is None: if how == "method": @@ -183,9 +186,23 @@ def test_groupby_raises_string( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" + elif groupby_func not in reduction_kernels and how == "agg": + warn_msg = ( + f"In the future, using the non-aggregation func='{groupby_func}' will " + "raise a ValueError" + ) else: warn_msg = "" - _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) + _call_and_check( + klass, + msg, + how, + gb, + groupby_func, + args, + warn_msg, + warn_category=DeprecationWarning, + ) @pytest.mark.parametrize("how", ["agg", "transform"]) @@ -287,12 +304,30 @@ def test_groupby_raises_datetime( if groupby_func in ["any", "all"]: warn_msg = f"'{groupby_func}' with datetime64 dtypes is deprecated" + warn_category = FutureWarning elif groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" + warn_category = FutureWarning + elif groupby_func not in reduction_kernels and how == "agg": + warn_msg = ( + f"In the future, using the non-aggregation func='{groupby_func}' will " + "raise a ValueError" + ) + warn_category = DeprecationWarning else: warn_msg = "" - _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=warn_msg) + warn_category = FutureWarning + _call_and_check( + klass, + msg, + how, + gb, + groupby_func, + args, + warn_msg=warn_msg, + warn_category=warn_category, + ) @pytest.mark.parametrize("how", ["agg", "transform"]) @@ -487,9 +522,19 @@ def test_groupby_raises_category( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" + warn_category = FutureWarning + elif groupby_func not in reduction_kernels and how == "agg": + warn_msg = ( + f"In the future, using the non-aggregation func='{groupby_func}' " + "will raise a ValueError" + ) + warn_category = DeprecationWarning else: warn_msg = "" - _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) + warn_category = FutureWarning + _call_and_check( + klass, msg, how, gb, groupby_func, args, warn_msg, warn_category=warn_category + ) @pytest.mark.parametrize("how", ["agg", "transform"]) @@ -660,6 +705,13 @@ def test_groupby_raises_category_on_category( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" + warn_category = FutureWarning + elif groupby_func not in reduction_kernels and how == "agg": + warn_msg = f"using the non-aggregation func='{groupby_func}' will raise" + warn_category = DeprecationWarning else: warn_msg = "" - _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) + warn_category = FutureWarning + _call_and_check( + klass, msg, how, gb, groupby_func, args, warn_msg, 
warn_category=warn_category + ) From 2eeb95ff8dce4ec33458f5dd74d2c3b37e63814f Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 23 Mar 2024 13:25:50 -0400 Subject: [PATCH 7/8] Revert & more tests --- pandas/core/groupby/generic.py | 27 ------- .../tests/groupby/aggregate/test_aggregate.py | 71 ++++++++++++++----- pandas/tests/groupby/aggregate/test_cython.py | 13 +--- pandas/tests/groupby/test_groupby.py | 8 +-- pandas/tests/groupby/test_raises.py | 64 ++--------------- 5 files changed, 65 insertions(+), 118 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index d2d706be22424..d71714084de1b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -21,7 +21,6 @@ Union, cast, ) -import warnings import numpy as np @@ -33,7 +32,6 @@ Substitution, doc, ) -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( ensure_int64, @@ -64,10 +62,6 @@ import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.groupby import base -from pandas.core.groupby.base import ( - reduction_kernels, - transformation_kernels, -) from pandas.core.groupby.groupby import ( GroupBy, GroupByPlot, @@ -332,14 +326,6 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) kwargs = {} if isinstance(func, str): - if func not in reduction_kernels and not self._grouper._is_resample: - meth = "transform" if func in transformation_kernels else "apply" - warnings.warn( - f"In the future, using the non-aggregation {func=} will raise a " - f"ValueError, use this function with {type(self).__name__}.{meth}", - category=DeprecationWarning, - stacklevel=find_stack_level(), - ) if maybe_use_numba(engine) and engine is not None: # Not all agg functions support numba, only propagate numba kwargs # if user asks for numba, and engine is not None @@ -1576,19 +1562,6 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) kwargs["engine"] = engine kwargs["engine_kwargs"] = engine_kwargs - if ( - isinstance(func, str) - and func not in reduction_kernels - and not self._grouper._is_resample - ): - meth = "transform" if func in transformation_kernels else "apply" - warnings.warn( - f"In the future, using the non-aggregation {func=} will raise a " - f"ValueError, use this function with {type(self).__name__}.{meth}", - category=DeprecationWarning, - stacklevel=find_stack_level(), - ) - op = GroupByApply(self, func, args=args, kwargs=kwargs) result = op.agg() if not is_dict_like(func) and result is not None: diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 288b1b6cefd49..5178ae107774f 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -486,9 +486,6 @@ def test_groupby_agg_dict_dup_columns(): tm.assert_frame_equal(result, expected) -@pytest.mark.filterwarnings( - "ignore:In the future, using the non-aggregation func:DeprecationWarning" -) @pytest.mark.parametrize( "op", [ @@ -568,9 +565,7 @@ def test_order_aggregate_multiple_funcs(): # GH 25692 df = DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]}) - msg = "using the non-aggregation func='ohlc' will raise" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"]) + res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"]) result = res.columns.levels[1] expected = Index(["sum", "max", 
"mean", "ohlc", "min"]) @@ -1176,6 +1171,22 @@ def test_with_kwargs(self): expected = DataFrame({"": [13], "": [30]}) tm.assert_frame_equal(result, expected) + def test_unused_kwargs(self): + # GH#39169 - Passing kwargs used to have agg pass the entire frame rather + # than column-by-column + + # UDF that works on both the entire frame and column-by-column + func = lambda data, **kwargs: np.sum(np.sum(data)) + + df = DataFrame([[1, 2], [3, 4]]) + expected = DataFrame({0: [1, 3], 1: [2, 4]}) + + result = df.groupby(level=0).agg(func) + tm.assert_frame_equal(result, expected) + + result = df.groupby(level=0).agg(func, foo=42) + tm.assert_frame_equal(result, expected) + def test_agg_with_one_lambda(self): # GH 25719, write tests for DataFrameGroupby.agg with only one lambda df = DataFrame( @@ -1262,6 +1273,40 @@ def test_agg_multiple_lambda(self): ) tm.assert_frame_equal(result2, expected) + def test_multiple_udf_same_name(self): + # GH#28570 + quant50 = partial(np.percentile, q=50) + quant70 = partial(np.percentile, q=70) + + df = DataFrame({"col1": ["a", "a", "b", "b", "b"], "col2": [1, 2, 3, 4, 5]}) + expected = DataFrame( + [[1.5, 1.7], [4.0, 4.4]], + index=Index(["a", "b"], name="col1"), + columns=MultiIndex( + levels=[["col2"], ["percentile"]], + codes=[[0, 0], [0, 0]], + ), + ) + gb = df.groupby("col1") + result = gb.agg({"col2": [quant50, quant70]}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("use_kwargs", [True, False]) + def test_multiple_udf_with_args(self, use_kwargs): + # GH#26611 + def func(x, y): + return x.sum() + y + + df = DataFrame({"A": [1, 2]}) + expected = DataFrame({"A": [13]}) + gb = df.groupby([0, 0]) + if use_kwargs: + args, kwargs = (), {"y": 10} + else: + args, kwargs = (10,), {} + result = gb.agg(func, *args, **kwargs) + tm.assert_frame_equal(result, expected) + def test_pass_args_kwargs_duplicate_columns(tsframe, as_index): # go through _aggregate_frame with self.axis == 0 and duplicate columns @@ -1383,14 +1428,9 @@ def test_nonagg_agg(): df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 2, 1]}) g = df.groupby("a") - msg = "using the non-aggregation func='cumsum' will raise" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = g.agg(["cumsum"]) + result = g.agg(["cumsum"]) result.columns = result.columns.droplevel(-1) - - msg = "using the non-aggregation func='cumsum' will raise" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.agg("cumsum") + expected = g.agg("cumsum") tm.assert_frame_equal(result, expected) @@ -1467,7 +1507,6 @@ def test_groupby_agg_precision(any_real_numeric_dtype): levels=[["a"], ["b"]], codes=[[0], [0]], names=["key1", "key2"] ), ) - result = df.groupby(["key1", "key2"]).agg(lambda x: x) tm.assert_frame_equal(result, expected) @@ -1672,9 +1711,7 @@ def test_groupby_agg_extension_timedelta_cumsum_with_named_aggregation(): } ) gb = df.groupby("grps") - msg = "using the non-aggregation func='cumsum' will raise" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = gb.agg(td=("td", "cumsum")) + result = gb.agg(td=("td", "cumsum")) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index ac2abba2e353f..aafd06e8f88cf 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -21,7 +21,6 @@ bdate_range, ) import pandas._testing as tm -from pandas.core.groupby.base import reduction_kernels 
@pytest.mark.parametrize( @@ -288,16 +287,8 @@ def test_read_only_buffer_source_agg(agg): ) df._mgr.arrays[0].flags.writeable = False - if agg in reduction_kernels: - warn = None - msg = "" - else: - warn = DeprecationWarning - msg = f"using the non-aggregation func='{agg}' will raise" - with tm.assert_produces_warning(warn, match=msg): - result = df.groupby(["species"]).agg({"sepal_length": agg}) - with tm.assert_produces_warning(warn, match=msg): - expected = df.copy().groupby(["species"]).agg({"sepal_length": agg}) + result = df.groupby(["species"]).agg({"sepal_length": agg}) + expected = df.copy().groupby(["species"]).agg({"sepal_length": agg}) tm.assert_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c6b1afc646554..4119e7a6feaed 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2936,11 +2936,9 @@ def test_groupby_dropna_with_nunique_unique(): df = [[1, 1, 1, "A"], [1, None, 1, "A"], [1, None, 2, "A"], [1, None, 3, "A"]] df_dropna = DataFrame(df, columns=["a", "b", "c", "partner"]) - msg = "using the non-aggregation func='unique' will raise" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df_dropna.groupby(["a", "b", "c"], dropna=False).agg( - {"partner": ["nunique", "unique"]} - ) + result = df_dropna.groupby(["a", "b", "c"], dropna=False).agg( + {"partner": ["nunique", "unique"]} + ) index = MultiIndex.from_tuples( [(1, 1.0, 1), (1, np.nan, 1), (1, np.nan, 2), (1, np.nan, 3)], diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 1012e32ef8923..f9d5de72eda1d 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -15,7 +15,6 @@ Series, ) import pandas._testing as tm -from pandas.core.groupby.base import reduction_kernels from pandas.tests.groupby import get_groupby_method_args @@ -85,10 +84,8 @@ def df_with_cat_col(): return df -def _call_and_check( - klass, msg, how, gb, groupby_func, args, warn_msg="", warn_category=FutureWarning -): - warn_klass = None if warn_msg == "" else warn_category +def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""): + warn_klass = None if warn_msg == "" else FutureWarning with tm.assert_produces_warning(warn_klass, match=warn_msg, check_stacklevel=False): if klass is None: if how == "method": @@ -186,23 +183,9 @@ def test_groupby_raises_string( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" - elif groupby_func not in reduction_kernels and how == "agg": - warn_msg = ( - f"In the future, using the non-aggregation func='{groupby_func}' will " - "raise a ValueError" - ) else: warn_msg = "" - _call_and_check( - klass, - msg, - how, - gb, - groupby_func, - args, - warn_msg, - warn_category=DeprecationWarning, - ) + _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) @pytest.mark.parametrize("how", ["agg", "transform"]) @@ -304,30 +287,12 @@ def test_groupby_raises_datetime( if groupby_func in ["any", "all"]: warn_msg = f"'{groupby_func}' with datetime64 dtypes is deprecated" - warn_category = FutureWarning elif groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" - warn_category = FutureWarning - elif groupby_func not in reduction_kernels and how == "agg": - warn_msg = ( - f"In the future, using the non-aggregation func='{groupby_func}' will " - "raise 
a ValueError" - ) - warn_category = DeprecationWarning else: warn_msg = "" - warn_category = FutureWarning - _call_and_check( - klass, - msg, - how, - gb, - groupby_func, - args, - warn_msg=warn_msg, - warn_category=warn_category, - ) + _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=warn_msg) @pytest.mark.parametrize("how", ["agg", "transform"]) @@ -522,19 +487,9 @@ def test_groupby_raises_category( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" - warn_category = FutureWarning - elif groupby_func not in reduction_kernels and how == "agg": - warn_msg = ( - f"In the future, using the non-aggregation func='{groupby_func}' " - "will raise a ValueError" - ) - warn_category = DeprecationWarning else: warn_msg = "" - warn_category = FutureWarning - _call_and_check( - klass, msg, how, gb, groupby_func, args, warn_msg, warn_category=warn_category - ) + _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) @pytest.mark.parametrize("how", ["agg", "transform"]) @@ -705,13 +660,6 @@ def test_groupby_raises_category_on_category( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" - warn_category = FutureWarning - elif groupby_func not in reduction_kernels and how == "agg": - warn_msg = f"using the non-aggregation func='{groupby_func}' will raise" - warn_category = DeprecationWarning else: warn_msg = "" - warn_category = FutureWarning - _call_and_check( - klass, msg, how, gb, groupby_func, args, warn_msg, warn_category=warn_category - ) + _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) From b6fed214f19e6d7c777ecbd187c91d6832a811b9 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 23 Mar 2024 13:32:00 -0400 Subject: [PATCH 8/8] pre-commit --- pandas/tests/groupby/test_reductions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 26d30624d9d2a..4ac7790cd2675 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -67,7 +67,7 @@ def test_basic_aggregations(dtype): # corner cases result = grouped.aggregate(lambda x: x * 2) - expected = pd.Series( + expected = Series( { 0: data[data.index // 3 == 0] * 2, 1: data[data.index // 3 == 1] * 2, @@ -76,6 +76,7 @@ def test_basic_aggregations(dtype): ) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( "vals", [