From 7b6ab94720024d6696b19867f5f8f59f79587ff0 Mon Sep 17 00:00:00 2001 From: Richard Date: Tue, 18 Aug 2020 18:27:49 -0400 Subject: [PATCH 01/10] CLN: Decouple Series/DataFrame.transform --- pandas/core/frame.py | 17 ++- pandas/core/generic.py | 45 ++++++- pandas/core/series.py | 10 +- pandas/tests/frame/apply/test_frame_apply.py | 113 +++++++++++++++++- .../tests/series/apply/test_series_apply.py | 97 ++++++++++++++- 5 files changed, 271 insertions(+), 11 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 606bd4cc3b52d..bb99223e0e05c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -113,7 +113,7 @@ ) from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna -from pandas.core import algorithms, common as com, nanops, ops +from pandas.core import algorithms, base, common as com, nanops, ops from pandas.core.accessor import CachedAccessor from pandas.core.aggregation import reconstruct_func, relabel_result from pandas.core.arrays import Categorical, ExtensionArray @@ -7440,7 +7440,20 @@ def transform(self, func, axis=0, *args, **kwargs) -> "DataFrame": axis = self._get_axis_number(axis) if axis == 1: return self.T.transform(func, *args, **kwargs).T - return super().transform(func, *args, **kwargs) + + if isinstance(func, list): + func = {col: func for col in self} + elif isinstance(func, dict): + cols = sorted(set(func.keys()) - set(self.columns)) + if len(cols) > 0: + raise base.SpecificationError(f"Column(s) {cols} do not exist") + if any(isinstance(v, dict) for v in func.values()): + # GH 15931 - deprecation of renaming keys + raise base.SpecificationError("nested renamer is not supported") + + result = self._transform(func, *args, **kwargs) + + return result def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fea3efedb6abb..a937bb5cd546f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10750,9 
+10750,48 @@ def transform(self, func, *args, **kwargs): 1 1.000000 2.718282 2 1.414214 7.389056 """ - result = self.agg(func, *args, **kwargs) - if is_scalar(result) or len(result) != len(self): - raise ValueError("transforms cannot produce aggregated results") + raise NotImplementedError + + def _transform(self, func, *args, **kwargs): + if isinstance(func, dict): + results = {} + for name, how in func.items(): + colg = self._gotitem(name, ndim=1) + try: + results[name] = colg.transform(how, *args, **kwargs) + except Exception as e: + if str(e) == "Function did not transform": + raise e + + # combine results + if len(results) == 0: + raise ValueError("Transform function failed") + from pandas.core.reshape.concat import concat + + return concat(results, axis=1) + + try: + if isinstance(func, str): + result = self._try_aggregate_string_function(func, *args, **kwargs) + else: + f = self._get_cython_func(func) + if f and not args and not kwargs: + result = getattr(self, f)() + else: + try: + result = self.apply(func, args=args, **kwargs) + except Exception: + result = func(self, *args, **kwargs) + + except Exception: + raise ValueError("Transform function failed") + + # Functions that transform may return empty Series/DataFrame + # when the dtype is not appropriate + if isinstance(result, NDFrame) and result.empty: + raise ValueError("Transform function failed") + if not isinstance(result, NDFrame) or not result.index.equals(self.index): + raise ValueError("Function did not transform") return result diff --git a/pandas/core/series.py b/pandas/core/series.py index dbc105be3c62b..b482ff250291a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4083,7 +4083,15 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): def transform(self, func, axis=0, *args, **kwargs): # Validate the axis parameter self._get_axis_number(axis) - return super().transform(func, *args, **kwargs) + + if isinstance(func, list): + func = {com.get_callable_name(v) or v: v for v 
in func} + elif isinstance(func, dict): + if any(isinstance(v, dict) for v in func.values()): + raise base.SpecificationError("nested renamer is not supported") + + result = self._transform(func, *args, **kwargs) + return result def apply(self, func, convert_dtype=True, args=(), **kwds): """ diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py index 5a1e448beb40f..9b024a667c41b 100644 --- a/pandas/tests/frame/apply/test_frame_apply.py +++ b/pandas/tests/frame/apply/test_frame_apply.py @@ -2,6 +2,7 @@ from datetime import datetime from itertools import chain import operator +import re import warnings import numpy as np @@ -14,6 +15,7 @@ import pandas._testing as tm from pandas.core.apply import frame_apply from pandas.core.base import SpecificationError +from pandas.core.groupby.base import transformation_kernels @pytest.fixture @@ -1131,9 +1133,29 @@ def test_agg_transform(self, axis, float_frame): result = float_frame.transform([np.abs, "sqrt"], axis=axis) tm.assert_frame_equal(result, expected) + # UDF via apply + def func(x): + if isinstance(x, DataFrame): + raise ValueError + return x + 1 + + result = float_frame.transform(func, axis=axis) + expected = float_frame + 1 + tm.assert_frame_equal(result, expected) + + # UDF that maps DataFrame -> DataFrame + def func(x): + if not isinstance(x, DataFrame): + raise ValueError + return x + 1 + + result = float_frame.transform(func, axis=axis) + expected = float_frame + 1 + tm.assert_frame_equal(result, expected) + def test_transform_and_agg_err(self, axis, float_frame): # cannot both transform and agg - msg = "transforms cannot produce aggregated results" + msg = "Function did not transform" with pytest.raises(ValueError, match=msg): float_frame.transform(["max", "min"], axis=axis) @@ -1142,6 +1164,7 @@ def test_transform_and_agg_err(self, axis, float_frame): with np.errstate(all="ignore"): float_frame.agg(["max", "sqrt"], axis=axis) + msg = "Function did not 
transform" with pytest.raises(ValueError, match=msg): with np.errstate(all="ignore"): float_frame.transform(["max", "sqrt"], axis=axis) @@ -1221,6 +1244,9 @@ def test_agg_dict_nested_renaming_depr(self): with pytest.raises(SpecificationError, match=msg): df.agg({"A": {"foo": "min"}, "B": {"bar": "max"}}) + with pytest.raises(SpecificationError, match=msg): + df.transform({"A": {"foo": "min"}, "B": {"bar": "max"}}) + def test_agg_reduce(self, axis, float_frame): other_axis = 1 if axis in {0, "index"} else 0 name1, name2 = float_frame.axes[other_axis].unique()[:2].sort_values() @@ -1550,3 +1576,88 @@ def test_apply_empty_list_reduce(): result = df.apply(lambda x: [], result_type="reduce") expected = pd.Series({"a": [], "b": []}, dtype=object) tm.assert_series_equal(result, expected) + + +def test_transform_reducer_raises(all_reductions): + op = all_reductions + s = pd.DataFrame({"A": [1, 2, 3]}) + msg = "Function did not transform" + with pytest.raises(ValueError, match=msg): + s.transform(op) + with pytest.raises(ValueError, match=msg): + s.transform([op]) + with pytest.raises(ValueError, match=msg): + s.transform({"A": op}) + with pytest.raises(ValueError, match=msg): + s.transform({"A": [op]}) + + +# mypy doesn't allow adding lists of different types +# https://github.com/python/mypy/issues/5492 +@pytest.mark.parametrize("op", [*transformation_kernels, lambda x: x + 1]) +def test_transform_bad_dtype(op): + s = pd.DataFrame({"A": 3 * [object]}) # DataFrame that will fail on most transforms + if op in ("backfill", "shift", "pad", "bfill", "ffill"): + pytest.xfail("Transform function works on any datatype") + msg = "Transform function failed" + with pytest.raises(ValueError, match=msg): + s.transform(op) + with pytest.raises(ValueError, match=msg): + s.transform([op]) + with pytest.raises(ValueError, match=msg): + s.transform({"A": op}) + with pytest.raises(ValueError, match=msg): + s.transform({"A": [op]}) + + +@pytest.mark.parametrize("op", transformation_kernels) 
+def test_transform_multi_dtypes(op): + df = pd.DataFrame({"A": ["a", "b", "c"], "B": [1, 2, 3]}) + + # Determine which columns op will work on + columns = [] + for column in df: + try: + df[column].transform(op) + columns.append(column) + except Exception: + pass + + if len(columns) > 0: + expected = df[columns].transform([op]) + result = df.transform([op]) + tm.assert_equal(result, expected) + + expected = df[columns].transform({column: op for column in columns}) + result = df.transform({column: op for column in columns}) + tm.assert_equal(result, expected) + + expected = df[columns].transform({column: [op] for column in columns}) + result = df.transform({column: [op] for column in columns}) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("use_apply", [True, False]) +def test_transform_passes_args(use_apply): + # transform uses UDF either via apply or passing the entire DataFrame + expected_args = [1, 2] + expected_kwargs = {"c": 3} + + def f(x, a, b, c): + # transform is using apply iff x is not a DataFrame + if use_apply == isinstance(x, DataFrame): + # Force transform to fallback + raise ValueError + assert [a, b] == expected_args + assert c == expected_kwargs["c"] + return x + + pd.DataFrame([1]).transform(f, 0, *expected_args, **expected_kwargs) + + +@pytest.mark.parametrize("axis", [0, "index", 1, "columns"]) +def test_transform_missing_columns(axis): + df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) + match = re.escape("Column(s) ['C'] do not exist") + with pytest.raises(SpecificationError, match=match): + df.transform({"C": "cumsum"}) diff --git a/pandas/tests/series/apply/test_series_apply.py b/pandas/tests/series/apply/test_series_apply.py index 308398642895c..d41a759f950de 100644 --- a/pandas/tests/series/apply/test_series_apply.py +++ b/pandas/tests/series/apply/test_series_apply.py @@ -8,6 +8,7 @@ from pandas import DataFrame, Index, MultiIndex, Series, isna import pandas._testing as tm from pandas.core.base import SpecificationError 
+from pandas.core.groupby.base import transformation_kernels class TestSeriesApply: @@ -222,7 +223,7 @@ def test_transform(self, string_series): expected.columns = ["sqrt"] tm.assert_frame_equal(result, expected) - result = string_series.transform([np.sqrt]) + result = string_series.apply([np.sqrt]) tm.assert_frame_equal(result, expected) result = string_series.transform(["sqrt"]) @@ -248,9 +249,34 @@ def test_transform(self, string_series): result = string_series.apply({"foo": np.sqrt, "bar": np.abs}) tm.assert_series_equal(result.reindex_like(expected), expected) + expected = pd.concat([f_sqrt, f_abs], axis=1) + expected.columns = ["foo", "bar"] + result = string_series.transform({"foo": np.sqrt, "bar": np.abs}) + tm.assert_frame_equal(result, expected) + + # UDF via apply + def func(x): + if isinstance(x, Series): + raise ValueError + return x + 1 + + result = string_series.transform(func) + expected = string_series + 1 + tm.assert_series_equal(result, expected) + + # UDF that maps Series -> Series + def func(x): + if not isinstance(x, Series): + raise ValueError + return x + 1 + + result = string_series.transform(func) + expected = string_series + 1 + tm.assert_series_equal(result, expected) + def test_transform_and_agg_error(self, string_series): # we are trying to transform with an aggregator - msg = "transforms cannot produce aggregated results" + msg = "Function did not transform" with pytest.raises(ValueError, match=msg): string_series.transform(["min", "max"]) @@ -259,6 +285,7 @@ def test_transform_and_agg_error(self, string_series): with np.errstate(all="ignore"): string_series.agg(["sqrt", "max"]) + msg = "Function did not transform" with pytest.raises(ValueError, match=msg): with np.errstate(all="ignore"): string_series.transform(["sqrt", "max"]) @@ -467,11 +494,73 @@ def test_transform_none_to_type(self): # GH34377 df = pd.DataFrame({"a": [None]}) - msg = "DataFrame constructor called with incompatible data and dtype" - with pytest.raises(TypeError, 
match=msg): + msg = "Transform function failed.*" + with pytest.raises(ValueError, match=msg): df.transform({"a": int}) +def test_transform_reducer_raises(all_reductions): + op = all_reductions + s = pd.Series([1, 2, 3]) + msg = "Function did not transform" + with pytest.raises(ValueError, match=msg): + s.transform(op) + with pytest.raises(ValueError, match=msg): + s.transform([op]) + with pytest.raises(ValueError, match=msg): + s.transform({"A": op}) + with pytest.raises(ValueError, match=msg): + s.transform({"A": [op]}) + + +# mypy doesn't allow adding lists of different types +# https://github.com/python/mypy/issues/5492 +@pytest.mark.parametrize("op", [*transformation_kernels, lambda x: x + 1]) +def test_transform_bad_dtype(op): + s = pd.Series(3 * [object]) # Series that will fail on most transforms + if op in ("backfill", "shift", "pad", "bfill", "ffill"): + pytest.xfail("Transform function works on any datatype") + msg = "Transform function failed" + with pytest.raises(ValueError, match=msg): + s.transform(op) + with pytest.raises(ValueError, match=msg): + s.transform([op]) + with pytest.raises(ValueError, match=msg): + s.transform({"A": op}) + with pytest.raises(ValueError, match=msg): + s.transform({"A": [op]}) + + +@pytest.mark.parametrize("use_apply", [True, False]) +def test_transform_passes_args(use_apply): + # transform uses UDF either via apply or passing the entire Series + expected_args = [1, 2] + expected_kwargs = {"c": 3} + + def f(x, a, b, c): + # transform is using apply iff x is not a Series + if use_apply == isinstance(x, Series): + # Force transform to fallback + raise ValueError + assert [a, b] == expected_args + assert c == expected_kwargs["c"] + return x + + pd.Series([1]).transform(f, 0, *expected_args, **expected_kwargs) + + +def test_transform_axis_1_raises(): + msg = "No axis named 1 for object type Series" + with pytest.raises(ValueError, match=msg): + pd.Series([1]).transform("sum", axis=1) + + +def test_transform_nested_renamer(): 
+ match = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=match): + pd.Series([1]).transform({"A": {"B": ["sum"]}}) + + class TestSeriesMap: def test_map(self, datetime_series): index, data = tm.getMixedTypeDict() From 04c1238c18e881cc222b4df7498fdf6a2dc169ef Mon Sep 17 00:00:00 2001 From: rhshadrach Date: Fri, 4 Sep 2020 22:17:18 -0400 Subject: [PATCH 02/10] Broke out tests to separate modules; moved code to aggregation --- pandas/core/aggregation.py | 108 ++++++++++- pandas/core/base.py | 4 +- pandas/core/frame.py | 22 +-- pandas/core/generic.py | 43 ----- pandas/core/series.py | 12 +- pandas/tests/frame/apply/test_frame_apply.py | 157 +--------------- .../tests/frame/apply/test_frame_transform.py | 175 ++++++++++++++++++ pandas/tests/frame/common.py | 22 +++ .../tests/series/apply/test_series_apply.py | 121 +----------- .../series/apply/test_series_transform.py | 145 +++++++++++++++ 10 files changed, 458 insertions(+), 351 deletions(-) create mode 100644 pandas/tests/frame/apply/test_frame_transform.py create mode 100644 pandas/tests/series/apply/test_series_transform.py diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index e2374b81ca13b..9e99845e0eda5 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -20,11 +20,12 @@ from pandas._typing import AggFuncType, Label from pandas.core.dtypes.common import is_dict_like, is_list_like +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.base import SpecificationError import pandas.core.common as com from pandas.core.indexes.api import Index -from pandas.core.series import FrameOrSeriesUnion, Series +from pandas.core.series import FrameOrSeriesUnion def reconstruct_func( @@ -280,7 +281,7 @@ def relabel_result( func: Dict[str, List[Union[Callable, str]]], columns: Tuple, order: List[int], -) -> Dict[Label, Series]: +) -> Dict[Label, ABCSeries]: """Internal function to reorder result if relabelling is True for 
dataframe.agg, and return the reordered result in dict. @@ -306,10 +307,10 @@ def relabel_result( reordered_indexes = [ pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1]) ] - reordered_result_in_dict: Dict[Label, Series] = {} + reordered_result_in_dict: Dict[Label, ABCSeries] = {} idx = 0 - reorder_mask = not isinstance(result, Series) and len(result.columns) > 1 + reorder_mask = not isinstance(result, ABCSeries) and len(result.columns) > 1 for col, fun in func.items(): s = result[col].dropna() @@ -382,3 +383,102 @@ def validate_func_kwargs( if not columns: raise TypeError(no_arg_message) return columns, func + + +def transform( + obj: FrameOrSeriesUnion, + func: Union[str, List, Dict, Callable], + axis: int, + *args, + **kwargs, +) -> FrameOrSeriesUnion: + """ + Transform a DataFrame or Series + + Parameters + ---------- + obj : DataFrame or Series + Object to compute the transform on. + func : string, function, list, or dictionary + Function(s) to compute the transform with. + axis : {0 or 'index', 1 or 'columns'} + Axis along which the function is applied: + + * 0 or 'index': apply function to each column. + * 1 or 'columns': apply function to each row. + + Returns + ------- + DataFrame or Series + Result of applying ``func`` along the given axis of the + Series or DataFrame. + + Raises + ------ + ValueError + If the transform function fails or does not transform. 
+ """ + is_series = obj.ndim == 1 + + if obj._get_axis_number(axis) == 1: + assert not is_series + return transform(obj.T, func, 0, *args, **kwargs).T + + if isinstance(func, list): + if is_series: + func = {com.get_callable_name(v) or v: v for v in func} + else: + func = {col: func for col in obj} + + if isinstance(func, dict): + if not is_series: + cols = sorted(set(func.keys()) - set(obj.columns)) + if len(cols) > 0: + raise SpecificationError(f"Column(s) {cols} do not exist") + + if any(isinstance(v, dict) for v in func.values()): + # GH 15931 - deprecation of renaming keys + raise SpecificationError("nested renamer is not supported") + + results = {} + for name, how in func.items(): + colg = obj._gotitem(name, ndim=1) + try: + results[name] = transform(colg, how, 0, *args, **kwargs) + except Exception as e: + if str(e) == "Function did not transform": + raise e + + # combine results + if len(results) == 0: + raise ValueError("Transform function failed") + from pandas.core.reshape.concat import concat + + return concat(results, axis=1) + + # func is either str or callable + try: + if isinstance(func, str): + result = obj._try_aggregate_string_function(func, *args, **kwargs) + else: + f = obj._get_cython_func(func) + if f and not args and not kwargs: + result = getattr(obj, f)() + else: + try: + result = obj.apply(func, args=args, **kwargs) + except Exception: + result = func(obj, *args, **kwargs) + except Exception: + raise ValueError("Transform function failed") + + # Functions that transform may return empty Series/DataFrame + # when the dtype is not appropriate + if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty: + raise ValueError("Transform function failed") + if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals( + obj.index + ): + raise ValueError("Function did not transform") + + return result diff --git a/pandas/core/base.py b/pandas/core/base.py index b62ef668df5e1..b31a3f0407717 100644 --- 
a/pandas/core/base.py +++ b/pandas/core/base.py @@ -4,7 +4,7 @@ import builtins import textwrap -from typing import Any, Dict, FrozenSet, List, Optional, Union +from typing import Any, Callable, Dict, FrozenSet, List, Optional, Union import numpy as np @@ -560,7 +560,7 @@ def _aggregate_multiple_funcs(self, arg, _axis): ) from err return result - def _get_cython_func(self, arg: str) -> Optional[str]: + def _get_cython_func(self, arg: Callable) -> Optional[str]: """ if we define an internal function for this argument, return it """ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bb99223e0e05c..773d564e08799 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -113,9 +113,9 @@ ) from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna -from pandas.core import algorithms, base, common as com, nanops, ops +from pandas.core import algorithms, common as com, nanops, ops from pandas.core.accessor import CachedAccessor -from pandas.core.aggregation import reconstruct_func, relabel_result +from pandas.core.aggregation import reconstruct_func, relabel_result, transform from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor @@ -7437,22 +7437,8 @@ def _aggregate(self, arg, axis=0, *args, **kwargs): axis=_shared_doc_kwargs["axis"], ) def transform(self, func, axis=0, *args, **kwargs) -> "DataFrame": - axis = self._get_axis_number(axis) - if axis == 1: - return self.T.transform(func, *args, **kwargs).T - - if isinstance(func, list): - func = {col: func for col in self} - elif isinstance(func, dict): - cols = sorted(set(func.keys()) - set(self.columns)) - if len(cols) > 0: - raise base.SpecificationError(f"Column(s) {cols} do not exist") - if any(isinstance(v, dict) for v in func.values()): - # GH 15931 - deprecation of renaming keys - raise base.SpecificationError("nested renamer is not 
supported") - - result = self._transform(func, *args, **kwargs) - + result = transform(self, func, axis, *args, **kwargs) + assert isinstance(result, DataFrame) return result def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a937bb5cd546f..722fd063927c4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10752,49 +10752,6 @@ def transform(self, func, *args, **kwargs): """ raise NotImplementedError - def _transform(self, func, *args, **kwargs): - if isinstance(func, dict): - results = {} - for name, how in func.items(): - colg = self._gotitem(name, ndim=1) - try: - results[name] = colg.transform(how, *args, **kwargs) - except Exception as e: - if str(e) == "Function did not transform": - raise e - - # combine results - if len(results) == 0: - raise ValueError("Transform function failed") - from pandas.core.reshape.concat import concat - - return concat(results, axis=1) - - try: - if isinstance(func, str): - result = self._try_aggregate_string_function(func, *args, **kwargs) - else: - f = self._get_cython_func(func) - if f and not args and not kwargs: - result = getattr(self, f)() - else: - try: - result = self.apply(func, args=args, **kwargs) - except Exception: - result = func(self, *args, **kwargs) - - except Exception: - raise ValueError("Transform function failed") - - # Functions that transform may return empty Series/DataFrame - # when the dtype is not appropriate - if isinstance(result, NDFrame) and result.empty: - raise ValueError("Transform function failed") - if not isinstance(result, NDFrame) or not result.index.equals(self.index): - raise ValueError("Function did not transform") - - return result - # ---------------------------------------------------------------------- # Misc methods diff --git a/pandas/core/series.py b/pandas/core/series.py index b482ff250291a..6051df1bec9b1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ 
-4081,17 +4081,9 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): axis=_shared_doc_kwargs["axis"], ) def transform(self, func, axis=0, *args, **kwargs): - # Validate the axis parameter - self._get_axis_number(axis) + from pandas.core.aggregation import transform - if isinstance(func, list): - func = {com.get_callable_name(v) or v: v for v in func} - elif isinstance(func, dict): - if any(isinstance(v, dict) for v in func.values()): - raise base.SpecificationError("nested renamer is not supported") - - result = self._transform(func, *args, **kwargs) - return result + return transform(self, func, axis, *args, **kwargs) def apply(self, func, convert_dtype=True, args=(), **kwds): """ diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py index 9b024a667c41b..6084396352e41 100644 --- a/pandas/tests/frame/apply/test_frame_apply.py +++ b/pandas/tests/frame/apply/test_frame_apply.py @@ -1,8 +1,6 @@ from collections import OrderedDict from datetime import datetime from itertools import chain -import operator -import re import warnings import numpy as np @@ -15,7 +13,7 @@ import pandas._testing as tm from pandas.core.apply import frame_apply from pandas.core.base import SpecificationError -from pandas.core.groupby.base import transformation_kernels +from pandas.tests.frame.common import zip_frames @pytest.fixture @@ -1060,25 +1058,6 @@ def test_consistency_for_boxed(self, box, int_frame_const_col): tm.assert_frame_equal(result, expected) -def zip_frames(frames, axis=1): - """ - take a list of frames, zip them together under the - assumption that these all have the first frames' index/columns. 
- - Returns - ------- - new_frame : DataFrame - """ - if axis == 1: - columns = frames[0].columns - zipped = [f.loc[:, c] for c in columns for f in frames] - return pd.concat(zipped, axis=1) - else: - index = frames[0].index - zipped = [f.loc[i, :] for i in index for f in frames] - return pd.DataFrame(zipped) - - class TestDataFrameAggregate: def test_agg_transform(self, axis, float_frame): other_axis = 1 if axis in {0, "index"} else 0 @@ -1089,10 +1068,7 @@ def test_agg_transform(self, axis, float_frame): f_sqrt = np.sqrt(float_frame) # ufunc - result = float_frame.transform(np.sqrt, axis=axis) expected = f_sqrt.copy() - tm.assert_frame_equal(result, expected) - result = float_frame.apply(np.sqrt, axis=axis) tm.assert_frame_equal(result, expected) @@ -1112,9 +1088,6 @@ def test_agg_transform(self, axis, float_frame): ) tm.assert_frame_equal(result, expected) - result = float_frame.transform([np.sqrt], axis=axis) - tm.assert_frame_equal(result, expected) - # multiple items in list # these are in the order as if we are applying both # functions per series and then concatting @@ -1130,59 +1103,19 @@ def test_agg_transform(self, axis, float_frame): ) tm.assert_frame_equal(result, expected) - result = float_frame.transform([np.abs, "sqrt"], axis=axis) - tm.assert_frame_equal(result, expected) - - # UDF via apply - def func(x): - if isinstance(x, DataFrame): - raise ValueError - return x + 1 - - result = float_frame.transform(func, axis=axis) - expected = float_frame + 1 - tm.assert_frame_equal(result, expected) - - # UDF that maps DataFrame -> DataFrame - def func(x): - if not isinstance(x, DataFrame): - raise ValueError - return x + 1 - - result = float_frame.transform(func, axis=axis) - expected = float_frame + 1 - tm.assert_frame_equal(result, expected) - def test_transform_and_agg_err(self, axis, float_frame): # cannot both transform and agg - msg = "Function did not transform" - with pytest.raises(ValueError, match=msg): - float_frame.transform(["max", "min"], 
axis=axis) - msg = "cannot combine transform and aggregation operations" with pytest.raises(ValueError, match=msg): with np.errstate(all="ignore"): float_frame.agg(["max", "sqrt"], axis=axis) - msg = "Function did not transform" - with pytest.raises(ValueError, match=msg): - with np.errstate(all="ignore"): - float_frame.transform(["max", "sqrt"], axis=axis) - df = pd.DataFrame({"A": range(5), "B": 5}) def f(): with np.errstate(all="ignore"): df.agg({"A": ["abs", "sum"], "B": ["mean", "max"]}, axis=axis) - @pytest.mark.parametrize("method", ["abs", "shift", "pct_change", "cumsum", "rank"]) - def test_transform_method_name(self, method): - # GH 19760 - df = pd.DataFrame({"A": [-1, 2]}) - result = df.transform(method) - expected = operator.methodcaller(method)(df) - tm.assert_frame_equal(result, expected) - def test_demo(self): # demonstration tests df = pd.DataFrame({"A": range(5), "B": 5}) @@ -1244,9 +1177,6 @@ def test_agg_dict_nested_renaming_depr(self): with pytest.raises(SpecificationError, match=msg): df.agg({"A": {"foo": "min"}, "B": {"bar": "max"}}) - with pytest.raises(SpecificationError, match=msg): - df.transform({"A": {"foo": "min"}, "B": {"bar": "max"}}) - def test_agg_reduce(self, axis, float_frame): other_axis = 1 if axis in {0, "index"} else 0 name1, name2 = float_frame.axes[other_axis].unique()[:2].sort_values() @@ -1576,88 +1506,3 @@ def test_apply_empty_list_reduce(): result = df.apply(lambda x: [], result_type="reduce") expected = pd.Series({"a": [], "b": []}, dtype=object) tm.assert_series_equal(result, expected) - - -def test_transform_reducer_raises(all_reductions): - op = all_reductions - s = pd.DataFrame({"A": [1, 2, 3]}) - msg = "Function did not transform" - with pytest.raises(ValueError, match=msg): - s.transform(op) - with pytest.raises(ValueError, match=msg): - s.transform([op]) - with pytest.raises(ValueError, match=msg): - s.transform({"A": op}) - with pytest.raises(ValueError, match=msg): - s.transform({"A": [op]}) - - -# mypy doesn't 
allow adding lists of different types -# https://github.com/python/mypy/issues/5492 -@pytest.mark.parametrize("op", [*transformation_kernels, lambda x: x + 1]) -def test_transform_bad_dtype(op): - s = pd.DataFrame({"A": 3 * [object]}) # DataFrame that will fail on most transforms - if op in ("backfill", "shift", "pad", "bfill", "ffill"): - pytest.xfail("Transform function works on any datatype") - msg = "Transform function failed" - with pytest.raises(ValueError, match=msg): - s.transform(op) - with pytest.raises(ValueError, match=msg): - s.transform([op]) - with pytest.raises(ValueError, match=msg): - s.transform({"A": op}) - with pytest.raises(ValueError, match=msg): - s.transform({"A": [op]}) - - -@pytest.mark.parametrize("op", transformation_kernels) -def test_transform_multi_dtypes(op): - df = pd.DataFrame({"A": ["a", "b", "c"], "B": [1, 2, 3]}) - - # Determine which columns op will work on - columns = [] - for column in df: - try: - df[column].transform(op) - columns.append(column) - except Exception: - pass - - if len(columns) > 0: - expected = df[columns].transform([op]) - result = df.transform([op]) - tm.assert_equal(result, expected) - - expected = df[columns].transform({column: op for column in columns}) - result = df.transform({column: op for column in columns}) - tm.assert_equal(result, expected) - - expected = df[columns].transform({column: [op] for column in columns}) - result = df.transform({column: [op] for column in columns}) - tm.assert_equal(result, expected) - - -@pytest.mark.parametrize("use_apply", [True, False]) -def test_transform_passes_args(use_apply): - # transform uses UDF either via apply or passing the entire DataFrame - expected_args = [1, 2] - expected_kwargs = {"c": 3} - - def f(x, a, b, c): - # transform is using apply iff x is not a DataFrame - if use_apply == isinstance(x, DataFrame): - # Force transform to fallback - raise ValueError - assert [a, b] == expected_args - assert c == expected_kwargs["c"] - return x - - 
pd.DataFrame([1]).transform(f, 0, *expected_args, **expected_kwargs) - - -@pytest.mark.parametrize("axis", [0, "index", 1, "columns"]) -def test_transform_missing_columns(axis): - df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) - match = re.escape("Column(s) ['C'] do not exist") - with pytest.raises(SpecificationError, match=match): - df.transform({"C": "cumsum"}) diff --git a/pandas/tests/frame/apply/test_frame_transform.py b/pandas/tests/frame/apply/test_frame_transform.py new file mode 100644 index 0000000000000..fc2c5b93d3d08 --- /dev/null +++ b/pandas/tests/frame/apply/test_frame_transform.py @@ -0,0 +1,175 @@ +import operator +import re + +import numpy as np +import pytest + +from pandas import DataFrame, MultiIndex +import pandas._testing as tm +from pandas.core.base import SpecificationError +from pandas.core.groupby.base import transformation_kernels +from pandas.tests.frame.common import zip_frames + + +def test_agg_transform(axis, float_frame): + other_axis = 1 if axis in {0, "index"} else 0 + + f_abs = np.abs(float_frame) + f_sqrt = np.sqrt(float_frame) + + # ufunc + result = float_frame.transform(np.sqrt, axis=axis) + expected = f_sqrt + tm.assert_frame_equal(result, expected) + + # list-like + expected = f_sqrt.copy() + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product([float_frame.columns, ["sqrt"]]) + else: + expected.index = MultiIndex.from_product([float_frame.index, ["sqrt"]]) + result = float_frame.transform([np.sqrt], axis=axis) + tm.assert_frame_equal(result, expected) + + # multiple items in list + # these are in the order as if we are applying both + # functions per series and then concatting + expected = zip_frames([f_abs, f_sqrt], axis=other_axis) + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product( + [float_frame.columns, ["absolute", "sqrt"]] + ) + else: + expected.index = MultiIndex.from_product( + [float_frame.index, ["absolute", "sqrt"]] + ) + result = float_frame.transform([np.abs, "sqrt"], 
axis=axis) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("use_apply", [True, False]) +def test_transform_udf(axis, float_frame, use_apply): + # transform uses UDF either via apply or passing the entire DataFrame + def func(x): + # transform is using apply iff x is not a DataFrame + if use_apply == isinstance(x, DataFrame): + # Force transform to fallback + raise ValueError + return x + 1 + + result = float_frame.transform(func, axis=axis) + expected = float_frame + 1 + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("method", ["abs", "shift", "pct_change", "cumsum", "rank"]) +def test_transform_method_name(method): + # GH 19760 + df = DataFrame({"A": [-1, 2]}) + result = df.transform(method) + expected = operator.methodcaller(method)(df) + tm.assert_frame_equal(result, expected) + + +def test_transform_and_agg_err(axis, float_frame): + # cannot both transform and agg + msg = "Function did not transform" + with pytest.raises(ValueError, match=msg): + float_frame.transform(["max", "min"], axis=axis) + + msg = "Function did not transform" + with pytest.raises(ValueError, match=msg): + float_frame.transform(["max", "sqrt"], axis=axis) + + +def test_agg_dict_nested_renaming_depr(): + df = DataFrame({"A": range(5), "B": 5}) + + # nested renaming + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + df.transform({"A": {"foo": "min"}, "B": {"bar": "max"}}) + + +def test_transform_reducer_raises(all_reductions): + op = all_reductions + df = DataFrame({"A": [1, 2, 3]}) + msg = "Function did not transform" + with pytest.raises(ValueError, match=msg): + df.transform(op) + with pytest.raises(ValueError, match=msg): + df.transform([op]) + with pytest.raises(ValueError, match=msg): + df.transform({"A": op}) + with pytest.raises(ValueError, match=msg): + df.transform({"A": [op]}) + + +# mypy doesn't allow adding lists of different types +# https://github.com/python/mypy/issues/5492 
+@pytest.mark.parametrize("op", [*transformation_kernels, lambda x: x + 1]) +def test_transform_bad_dtype(op): + df = DataFrame({"A": 3 * [object]}) # DataFrame that will fail on most transforms + if op in ("backfill", "shift", "pad", "bfill", "ffill"): + pytest.xfail("Transform function works on any datatype") + msg = "Transform function failed" + with pytest.raises(ValueError, match=msg): + df.transform(op) + with pytest.raises(ValueError, match=msg): + df.transform([op]) + with pytest.raises(ValueError, match=msg): + df.transform({"A": op}) + with pytest.raises(ValueError, match=msg): + df.transform({"A": [op]}) + + +@pytest.mark.parametrize("op", transformation_kernels) +def test_transform_multi_dtypes(op): + df = DataFrame({"A": ["a", "b", "c"], "B": [1, 2, 3]}) + + # Determine which columns op will work on + columns = [] + for column in df: + try: + df[column].transform(op) + columns.append(column) + except Exception: + pass + + if len(columns) > 0: + expected = df[columns].transform([op]) + result = df.transform([op]) + tm.assert_equal(result, expected) + + expected = df[columns].transform({column: op for column in columns}) + result = df.transform({column: op for column in columns}) + tm.assert_equal(result, expected) + + expected = df[columns].transform({column: [op] for column in columns}) + result = df.transform({column: [op] for column in columns}) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("use_apply", [True, False]) +def test_transform_passes_args(use_apply): + # transform uses UDF either via apply or passing the entire DataFrame + expected_args = [1, 2] + expected_kwargs = {"c": 3} + + def f(x, a, b, c): + # transform is using apply iff x is not a DataFrame + if use_apply == isinstance(x, DataFrame): + # Force transform to fallback + raise ValueError + assert [a, b] == expected_args + assert c == expected_kwargs["c"] + return x + + DataFrame([1]).transform(f, 0, *expected_args, **expected_kwargs) + + +def 
test_transform_missing_columns(axis): + df = DataFrame({"A": [1, 2], "B": [3, 4]}) + match = re.escape("Column(s) ['C'] do not exist") + with pytest.raises(SpecificationError, match=match): + df.transform({"C": "cumsum"}) diff --git a/pandas/tests/frame/common.py b/pandas/tests/frame/common.py index 463a140972ab5..50e8c8632a525 100644 --- a/pandas/tests/frame/common.py +++ b/pandas/tests/frame/common.py @@ -1,3 +1,6 @@ +from pandas import DataFrame, concat + + def _check_mixed_float(df, dtype=None): # float16 are most likely to be upcasted to float32 dtypes = dict(A="float32", B="float32", C="float16", D="float64") @@ -29,3 +32,22 @@ def _check_mixed_int(df, dtype=None): assert df.dtypes["C"] == dtypes["C"] if dtypes.get("D"): assert df.dtypes["D"] == dtypes["D"] + + +def zip_frames(frames, axis=1): + """ + take a list of frames, zip them together under the + assumption that these all have the first frames' index/columns. + + Returns + ------- + new_frame : DataFrame + """ + if axis == 1: + columns = frames[0].columns + zipped = [f.loc[:, c] for c in columns for f in frames] + return concat(zipped, axis=1) + else: + index = frames[0].index + zipped = [f.loc[i, :] for i in index for f in frames] + return DataFrame(zipped) diff --git a/pandas/tests/series/apply/test_series_apply.py b/pandas/tests/series/apply/test_series_apply.py index d41a759f950de..827f466e23106 100644 --- a/pandas/tests/series/apply/test_series_apply.py +++ b/pandas/tests/series/apply/test_series_apply.py @@ -8,7 +8,6 @@ from pandas import DataFrame, Index, MultiIndex, Series, isna import pandas._testing as tm from pandas.core.base import SpecificationError -from pandas.core.groupby.base import transformation_kernels class TestSeriesApply: @@ -210,23 +209,17 @@ def test_transform(self, string_series): f_abs = np.abs(string_series) # ufunc - result = string_series.transform(np.sqrt) - expected = f_sqrt.copy() - tm.assert_series_equal(result, expected) - result = string_series.apply(np.sqrt) + 
expected = f_sqrt.copy() tm.assert_series_equal(result, expected) # list-like - result = string_series.transform([np.sqrt]) + result = string_series.apply([np.sqrt]) expected = f_sqrt.to_frame().copy() expected.columns = ["sqrt"] tm.assert_frame_equal(result, expected) - result = string_series.apply([np.sqrt]) - tm.assert_frame_equal(result, expected) - - result = string_series.transform(["sqrt"]) + result = string_series.apply(["sqrt"]) tm.assert_frame_equal(result, expected) # multiple items in list @@ -237,10 +230,6 @@ def test_transform(self, string_series): result = string_series.apply([np.sqrt, np.abs]) tm.assert_frame_equal(result, expected) - result = string_series.transform(["sqrt", "abs"]) - expected.columns = ["sqrt", "abs"] - tm.assert_frame_equal(result, expected) - # dict, provide renaming expected = pd.concat([f_sqrt, f_abs], axis=1) expected.columns = ["foo", "bar"] @@ -249,47 +238,13 @@ def test_transform(self, string_series): result = string_series.apply({"foo": np.sqrt, "bar": np.abs}) tm.assert_series_equal(result.reindex_like(expected), expected) - expected = pd.concat([f_sqrt, f_abs], axis=1) - expected.columns = ["foo", "bar"] - result = string_series.transform({"foo": np.sqrt, "bar": np.abs}) - tm.assert_frame_equal(result, expected) - - # UDF via apply - def func(x): - if isinstance(x, Series): - raise ValueError - return x + 1 - - result = string_series.transform(func) - expected = string_series + 1 - tm.assert_series_equal(result, expected) - - # UDF that maps Series -> Series - def func(x): - if not isinstance(x, Series): - raise ValueError - return x + 1 - - result = string_series.transform(func) - expected = string_series + 1 - tm.assert_series_equal(result, expected) - def test_transform_and_agg_error(self, string_series): # we are trying to transform with an aggregator - msg = "Function did not transform" - with pytest.raises(ValueError, match=msg): - string_series.transform(["min", "max"]) - msg = "cannot combine transform and 
aggregation" with pytest.raises(ValueError, match=msg): with np.errstate(all="ignore"): string_series.agg(["sqrt", "max"]) - msg = "Function did not transform" - with pytest.raises(ValueError, match=msg): - with np.errstate(all="ignore"): - string_series.transform(["sqrt", "max"]) - msg = "cannot perform both aggregation and transformation" with pytest.raises(ValueError, match=msg): with np.errstate(all="ignore"): @@ -490,76 +445,6 @@ def test_agg_cython_table_raises(self, series, func, expected): # e.g. Series('a b'.split()).cumprod() will raise series.agg(func) - def test_transform_none_to_type(self): - # GH34377 - df = pd.DataFrame({"a": [None]}) - - msg = "Transform function failed.*" - with pytest.raises(ValueError, match=msg): - df.transform({"a": int}) - - -def test_transform_reducer_raises(all_reductions): - op = all_reductions - s = pd.Series([1, 2, 3]) - msg = "Function did not transform" - with pytest.raises(ValueError, match=msg): - s.transform(op) - with pytest.raises(ValueError, match=msg): - s.transform([op]) - with pytest.raises(ValueError, match=msg): - s.transform({"A": op}) - with pytest.raises(ValueError, match=msg): - s.transform({"A": [op]}) - - -# mypy doesn't allow adding lists of different types -# https://github.com/python/mypy/issues/5492 -@pytest.mark.parametrize("op", [*transformation_kernels, lambda x: x + 1]) -def test_transform_bad_dtype(op): - s = pd.Series(3 * [object]) # Series that will fail on most transforms - if op in ("backfill", "shift", "pad", "bfill", "ffill"): - pytest.xfail("Transform function works on any datatype") - msg = "Transform function failed" - with pytest.raises(ValueError, match=msg): - s.transform(op) - with pytest.raises(ValueError, match=msg): - s.transform([op]) - with pytest.raises(ValueError, match=msg): - s.transform({"A": op}) - with pytest.raises(ValueError, match=msg): - s.transform({"A": [op]}) - - -@pytest.mark.parametrize("use_apply", [True, False]) -def test_transform_passes_args(use_apply): - # 
transform uses UDF either via apply or passing the entire Series - expected_args = [1, 2] - expected_kwargs = {"c": 3} - - def f(x, a, b, c): - # transform is using apply iff x is not a Series - if use_apply == isinstance(x, Series): - # Force transform to fallback - raise ValueError - assert [a, b] == expected_args - assert c == expected_kwargs["c"] - return x - - pd.Series([1]).transform(f, 0, *expected_args, **expected_kwargs) - - -def test_transform_axis_1_raises(): - msg = "No axis named 1 for object type Series" - with pytest.raises(ValueError, match=msg): - pd.Series([1]).transform("sum", axis=1) - - -def test_transform_nested_renamer(): - match = "nested renamer is not supported" - with pytest.raises(SpecificationError, match=match): - pd.Series([1]).transform({"A": {"B": ["sum"]}}) - class TestSeriesMap: def test_map(self, datetime_series): diff --git a/pandas/tests/series/apply/test_series_transform.py b/pandas/tests/series/apply/test_series_transform.py new file mode 100644 index 0000000000000..351bd11f5aff7 --- /dev/null +++ b/pandas/tests/series/apply/test_series_transform.py @@ -0,0 +1,145 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Series, concat +import pandas._testing as tm +from pandas.core.base import SpecificationError +from pandas.core.groupby.base import transformation_kernels + + +def test_transform(string_series): + # transforming functions + f_sqrt = np.sqrt(string_series) + f_abs = np.abs(string_series) + + # ufunc + result = string_series.transform(np.sqrt) + expected = f_sqrt.copy() + tm.assert_series_equal(result, expected) + + # list-like + result = string_series.transform([np.sqrt]) + expected = f_sqrt.to_frame().copy() + expected.columns = ["sqrt"] + tm.assert_frame_equal(result, expected) + + result = string_series.transform(["sqrt"]) + tm.assert_frame_equal(result, expected) + + # multiple items in list + # these are in the order as if we are applying both functions per + # series and then concatting + 
result = string_series.transform(["sqrt", "abs"]) + expected = concat([f_sqrt, f_abs], axis=1) + expected.columns = ["sqrt", "abs"] + tm.assert_frame_equal(result, expected) + + # dict, provide renaming + expected = concat([f_sqrt, f_abs], axis=1) + expected.columns = ["foo", "bar"] + result = string_series.transform({"foo": np.sqrt, "bar": np.abs}) + tm.assert_frame_equal(result, expected) + + +def test_transform_udf(axis, string_series): + # via apply + def func(x): + if isinstance(x, Series): + raise ValueError + return x + 1 + + result = string_series.transform(func) + expected = string_series + 1 + tm.assert_series_equal(result, expected) + + # via map Series -> Series + def func(x): + if not isinstance(x, Series): + raise ValueError + return x + 1 + + result = string_series.transform(func) + expected = string_series + 1 + tm.assert_series_equal(result, expected) + + +def test_transform_wont_agg(string_series): + # we are trying to transform with an aggregator + msg = "Function did not transform" + with pytest.raises(ValueError, match=msg): + string_series.transform(["min", "max"]) + + msg = "Function did not transform" + with pytest.raises(ValueError, match=msg): + with np.errstate(all="ignore"): + string_series.transform(["sqrt", "max"]) + + +def test_transform_none_to_type(): + # GH34377 + df = DataFrame({"a": [None]}) + msg = "Transform function failed" + with pytest.raises(ValueError, match=msg): + df.transform({"a": int}) + + +def test_transform_reducer_raises(all_reductions): + op = all_reductions + s = Series([1, 2, 3]) + msg = "Function did not transform" + with pytest.raises(ValueError, match=msg): + s.transform(op) + with pytest.raises(ValueError, match=msg): + s.transform([op]) + with pytest.raises(ValueError, match=msg): + s.transform({"A": op}) + with pytest.raises(ValueError, match=msg): + s.transform({"A": [op]}) + + +# mypy doesn't allow adding lists of different types +# https://github.com/python/mypy/issues/5492 
+@pytest.mark.parametrize("op", [*transformation_kernels, lambda x: x + 1]) +def test_transform_bad_dtype(op): + s = Series(3 * [object]) # Series that will fail on most transforms + if op in ("backfill", "shift", "pad", "bfill", "ffill"): + pytest.xfail("Transform function works on any datatype") + msg = "Transform function failed" + with pytest.raises(ValueError, match=msg): + s.transform(op) + with pytest.raises(ValueError, match=msg): + s.transform([op]) + with pytest.raises(ValueError, match=msg): + s.transform({"A": op}) + with pytest.raises(ValueError, match=msg): + s.transform({"A": [op]}) + + +@pytest.mark.parametrize("use_apply", [True, False]) +def test_transform_passes_args(use_apply): + # transform uses UDF either via apply or passing the entire Series + expected_args = [1, 2] + expected_kwargs = {"c": 3} + + def f(x, a, b, c): + # transform is using apply iff x is not a Series + if use_apply == isinstance(x, Series): + # Force transform to fallback + raise ValueError + assert [a, b] == expected_args + assert c == expected_kwargs["c"] + return x + + Series([1]).transform(f, 0, *expected_args, **expected_kwargs) + + +def test_transform_axis_1_raises(): + msg = "No axis named 1 for object type Series" + with pytest.raises(ValueError, match=msg): + Series([1]).transform("sum", axis=1) + + +def test_transform_nested_renamer(): + match = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=match): + Series([1]).transform({"A": {"B": ["sum"]}}) From 7b138111cac52b4e90457f67add4c081b2426d33 Mon Sep 17 00:00:00 2001 From: rhshadrach Date: Fri, 4 Sep 2020 22:37:41 -0400 Subject: [PATCH 03/10] Fixed type-hints --- pandas/core/aggregation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index c93049dbe779f..b1ec7f6f84ecc 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -388,12 +388,12 @@ def validate_func_kwargs( def transform( - 
obj: FrameOrSeriesUnion, + obj: FrameOrSeries, func: Union[str, List, Dict, Callable], axis: int, *args, **kwargs, -) -> FrameOrSeriesUnion: +) -> FrameOrSeries: """ Transform a DataFrame or Series From 133bfaa48e285b48d624fa3d21cee3d3f5bb89ff Mon Sep 17 00:00:00 2001 From: rhshadrach Date: Fri, 4 Sep 2020 23:28:25 -0400 Subject: [PATCH 04/10] Fixed tests - added context manager for np errors --- .../tests/frame/apply/test_frame_transform.py | 65 ++++++++++--------- .../series/apply/test_series_transform.py | 63 +++++++++--------- 2 files changed, 65 insertions(+), 63 deletions(-) diff --git a/pandas/tests/frame/apply/test_frame_transform.py b/pandas/tests/frame/apply/test_frame_transform.py index fc2c5b93d3d08..4ae4384ee936e 100644 --- a/pandas/tests/frame/apply/test_frame_transform.py +++ b/pandas/tests/frame/apply/test_frame_transform.py @@ -11,40 +11,41 @@ from pandas.tests.frame.common import zip_frames -def test_agg_transform(axis, float_frame): +def test_transform(axis, float_frame): other_axis = 1 if axis in {0, "index"} else 0 - f_abs = np.abs(float_frame) - f_sqrt = np.sqrt(float_frame) - - # ufunc - result = float_frame.transform(np.sqrt, axis=axis) - expected = f_sqrt - tm.assert_frame_equal(result, expected) - - # list-like - expected = f_sqrt.copy() - if axis in {0, "index"}: - expected.columns = MultiIndex.from_product([float_frame.columns, ["sqrt"]]) - else: - expected.index = MultiIndex.from_product([float_frame.index, ["sqrt"]]) - result = float_frame.transform([np.sqrt], axis=axis) - tm.assert_frame_equal(result, expected) - - # multiple items in list - # these are in the order as if we are applying both - # functions per series and then concatting - expected = zip_frames([f_abs, f_sqrt], axis=other_axis) - if axis in {0, "index"}: - expected.columns = MultiIndex.from_product( - [float_frame.columns, ["absolute", "sqrt"]] - ) - else: - expected.index = MultiIndex.from_product( - [float_frame.index, ["absolute", "sqrt"]] - ) - result = 
float_frame.transform([np.abs, "sqrt"], axis=axis) - tm.assert_frame_equal(result, expected) + with np.errstate(all="ignore"): + f_abs = np.abs(float_frame) + f_sqrt = np.sqrt(float_frame) + + # ufunc + result = float_frame.transform(np.sqrt, axis=axis) + expected = f_sqrt + tm.assert_frame_equal(result, expected) + + # list-like + expected = f_sqrt.copy() + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product([float_frame.columns, ["sqrt"]]) + else: + expected.index = MultiIndex.from_product([float_frame.index, ["sqrt"]]) + result = float_frame.transform([np.sqrt], axis=axis) + tm.assert_frame_equal(result, expected) + + # multiple items in list + # these are in the order as if we are applying both + # functions per series and then concatting + expected = zip_frames([f_abs, f_sqrt], axis=other_axis) + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product( + [float_frame.columns, ["absolute", "sqrt"]] + ) + else: + expected.index = MultiIndex.from_product( + [float_frame.index, ["absolute", "sqrt"]] + ) + result = float_frame.transform([np.abs, "sqrt"], axis=axis) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("use_apply", [True, False]) diff --git a/pandas/tests/series/apply/test_series_transform.py b/pandas/tests/series/apply/test_series_transform.py index 351bd11f5aff7..6f206540a4d1b 100644 --- a/pandas/tests/series/apply/test_series_transform.py +++ b/pandas/tests/series/apply/test_series_transform.py @@ -8,37 +8,38 @@ def test_transform(string_series): - # transforming functions - f_sqrt = np.sqrt(string_series) - f_abs = np.abs(string_series) - - # ufunc - result = string_series.transform(np.sqrt) - expected = f_sqrt.copy() - tm.assert_series_equal(result, expected) - - # list-like - result = string_series.transform([np.sqrt]) - expected = f_sqrt.to_frame().copy() - expected.columns = ["sqrt"] - tm.assert_frame_equal(result, expected) - - result = string_series.transform(["sqrt"]) - 
tm.assert_frame_equal(result, expected) - - # multiple items in list - # these are in the order as if we are applying both functions per - # series and then concatting - result = string_series.transform(["sqrt", "abs"]) - expected = concat([f_sqrt, f_abs], axis=1) - expected.columns = ["sqrt", "abs"] - tm.assert_frame_equal(result, expected) - - # dict, provide renaming - expected = concat([f_sqrt, f_abs], axis=1) - expected.columns = ["foo", "bar"] - result = string_series.transform({"foo": np.sqrt, "bar": np.abs}) - tm.assert_frame_equal(result, expected) + with np.errstate(all="ignore"): + # transforming functions + f_sqrt = np.sqrt(string_series) + f_abs = np.abs(string_series) + + # ufunc + result = string_series.transform(np.sqrt) + expected = f_sqrt.copy() + tm.assert_series_equal(result, expected) + + # list-like + result = string_series.transform([np.sqrt]) + expected = f_sqrt.to_frame().copy() + expected.columns = ["sqrt"] + tm.assert_frame_equal(result, expected) + + result = string_series.transform(["sqrt"]) + tm.assert_frame_equal(result, expected) + + # multiple items in list + # these are in the order as if we are applying both functions per + # series and then concatting + result = string_series.transform(["sqrt", "abs"]) + expected = concat([f_sqrt, f_abs], axis=1) + expected.columns = ["sqrt", "abs"] + tm.assert_frame_equal(result, expected) + + # dict, provide renaming + expected = concat([f_sqrt, f_abs], axis=1) + expected.columns = ["foo", "bar"] + result = string_series.transform({"foo": np.sqrt, "bar": np.abs}) + tm.assert_frame_equal(result, expected) def test_transform_udf(axis, string_series): From a5d4a19a290c4173590eba2d54f4d6126c4d67f2 Mon Sep 17 00:00:00 2001 From: Richard Date: Sat, 5 Sep 2020 10:05:15 -0400 Subject: [PATCH 05/10] Added PR # to tests --- pandas/tests/frame/apply/test_frame_transform.py | 8 ++++++++ pandas/tests/series/apply/test_series_transform.py | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git 
a/pandas/tests/frame/apply/test_frame_transform.py b/pandas/tests/frame/apply/test_frame_transform.py index 4ae4384ee936e..0d7533df5c0bf 100644 --- a/pandas/tests/frame/apply/test_frame_transform.py +++ b/pandas/tests/frame/apply/test_frame_transform.py @@ -12,6 +12,7 @@ def test_transform(axis, float_frame): + # GH 35964 other_axis = 1 if axis in {0, "index"} else 0 with np.errstate(all="ignore"): @@ -50,6 +51,7 @@ def test_transform(axis, float_frame): @pytest.mark.parametrize("use_apply", [True, False]) def test_transform_udf(axis, float_frame, use_apply): + # GH 35964 # transform uses UDF either via apply or passing the entire DataFrame def func(x): # transform is using apply iff x is not a DataFrame @@ -73,6 +75,7 @@ def test_transform_method_name(method): def test_transform_and_agg_err(axis, float_frame): + # GH 35964 # cannot both transform and agg msg = "Function did not transform" with pytest.raises(ValueError, match=msg): @@ -93,6 +96,7 @@ def test_agg_dict_nested_renaming_depr(): def test_transform_reducer_raises(all_reductions): + # GH 35964 op = all_reductions df = DataFrame({"A": [1, 2, 3]}) msg = "Function did not transform" @@ -110,6 +114,7 @@ def test_transform_reducer_raises(all_reductions): # https://github.com/python/mypy/issues/5492 @pytest.mark.parametrize("op", [*transformation_kernels, lambda x: x + 1]) def test_transform_bad_dtype(op): + # GH 35964 df = DataFrame({"A": 3 * [object]}) # DataFrame that will fail on most transforms if op in ("backfill", "shift", "pad", "bfill", "ffill"): pytest.xfail("Transform function works on any datatype") @@ -126,6 +131,7 @@ def test_transform_bad_dtype(op): @pytest.mark.parametrize("op", transformation_kernels) def test_transform_multi_dtypes(op): + # GH 35964 df = DataFrame({"A": ["a", "b", "c"], "B": [1, 2, 3]}) # Determine which columns op will work on @@ -153,6 +159,7 @@ def test_transform_multi_dtypes(op): @pytest.mark.parametrize("use_apply", [True, False]) def 
test_transform_passes_args(use_apply): + # GH 35964 # transform uses UDF either via apply or passing the entire DataFrame expected_args = [1, 2] expected_kwargs = {"c": 3} @@ -170,6 +177,7 @@ def f(x, a, b, c): def test_transform_missing_columns(axis): + # GH 35964 df = DataFrame({"A": [1, 2], "B": [3, 4]}) match = re.escape("Column(s) ['C'] do not exist") with pytest.raises(SpecificationError, match=match): diff --git a/pandas/tests/series/apply/test_series_transform.py b/pandas/tests/series/apply/test_series_transform.py index 6f206540a4d1b..54d4cd0de0d6a 100644 --- a/pandas/tests/series/apply/test_series_transform.py +++ b/pandas/tests/series/apply/test_series_transform.py @@ -8,6 +8,7 @@ def test_transform(string_series): + # GH 35964 with np.errstate(all="ignore"): # transforming functions f_sqrt = np.sqrt(string_series) @@ -43,6 +44,7 @@ def test_transform(string_series): def test_transform_udf(axis, string_series): + # GH 35964 # via apply def func(x): if isinstance(x, Series): @@ -65,6 +67,7 @@ def func(x): def test_transform_wont_agg(string_series): + # GH 35964 # we are trying to transform with an aggregator msg = "Function did not transform" with pytest.raises(ValueError, match=msg): @@ -85,6 +88,7 @@ def test_transform_none_to_type(): def test_transform_reducer_raises(all_reductions): + # GH 35964 op = all_reductions s = Series([1, 2, 3]) msg = "Function did not transform" @@ -102,6 +106,7 @@ def test_transform_reducer_raises(all_reductions): # https://github.com/python/mypy/issues/5492 @pytest.mark.parametrize("op", [*transformation_kernels, lambda x: x + 1]) def test_transform_bad_dtype(op): + # GH 35964 s = Series(3 * [object]) # Series that will fail on most transforms if op in ("backfill", "shift", "pad", "bfill", "ffill"): pytest.xfail("Transform function works on any datatype") @@ -118,6 +123,7 @@ def test_transform_bad_dtype(op): @pytest.mark.parametrize("use_apply", [True, False]) def test_transform_passes_args(use_apply): + # GH 35964 # 
transform uses UDF either via apply or passing the entire Series expected_args = [1, 2] expected_kwargs = {"c": 3} @@ -135,12 +141,14 @@ def f(x, a, b, c): def test_transform_axis_1_raises(): + # GH 35964 msg = "No axis named 1 for object type Series" with pytest.raises(ValueError, match=msg): Series([1]).transform("sum", axis=1) def test_transform_nested_renamer(): + # GH 35964 match = "nested renamer is not supported" with pytest.raises(SpecificationError, match=match): Series([1]).transform({"A": {"B": ["sum"]}}) From 8454d91701ce69a8cb0b4852c073181f9f026852 Mon Sep 17 00:00:00 2001 From: rhshadrach Date: Thu, 10 Sep 2020 00:58:31 -0400 Subject: [PATCH 06/10] Adjustments and cleanups --- pandas/core/aggregation.py | 12 +- pandas/core/frame.py | 11 +- pandas/core/generic.py | 70 --------- pandas/core/series.py | 12 +- pandas/core/shared_docs.py | 69 +++++++++ .../tests/frame/apply/test_frame_transform.py | 137 ++++++++---------- .../series/apply/test_series_transform.py | 68 +++------ 7 files changed, 164 insertions(+), 215 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index b1ec7f6f84ecc..8b74fe01d0dc0 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -18,7 +18,7 @@ Union, ) -from pandas._typing import AggFuncType, FrameOrSeries, Label +from pandas._typing import AggFuncType, Axis, FrameOrSeries, Label from pandas.core.dtypes.common import is_dict_like, is_list_like from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries @@ -388,11 +388,7 @@ def validate_func_kwargs( def transform( - obj: FrameOrSeries, - func: Union[str, List, Dict, Callable], - axis: int, - *args, - **kwargs, + obj: FrameOrSeries, func: AggFuncType, axis: Axis, *args, **kwargs, ) -> FrameOrSeries: """ Transform a DataFrame or Series @@ -420,6 +416,8 @@ def transform( ValueError If the transform function fails or does not transform. 
""" + from pandas.core.reshape.concat import concat + is_series = obj.ndim == 1 if obj._get_axis_number(axis) == 1: @@ -454,8 +452,6 @@ def transform( # combine results if len(results) == 0: raise ValueError("Transform function failed") - from pandas.core.reshape.concat import concat - return concat(results, axis=1) # func is either str or callable diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1f36c47642738..4fc91db133ef7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -45,6 +45,7 @@ from pandas._libs import algos as libalgos, lib, properties from pandas._libs.lib import no_default from pandas._typing import ( + AggFuncType, ArrayLike, Axes, Axis, @@ -7460,12 +7461,10 @@ def _aggregate(self, arg, axis=0, *args, **kwargs): agg = aggregate - @doc( - NDFrame.transform, - klass=_shared_doc_kwargs["klass"], - axis=_shared_doc_kwargs["axis"], - ) - def transform(self, func, axis=0, *args, **kwargs) -> DataFrame: + @Appender(_shared_docs["transform"]) + def transform( + self, func: AggFuncType, axis: Axis = 0, *args, **kwargs + ) -> DataFrame: result = transform(self, func, axis, *args, **kwargs) assert isinstance(result, DataFrame) return result diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4320e43215b99..9ed9db801d0a8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10648,76 +10648,6 @@ def ewm( times=times, ) - @doc(klass=_shared_doc_kwargs["klass"], axis="") - def transform(self, func, *args, **kwargs): - """ - Call ``func`` on self producing a {klass} with transformed values. - - Produced {klass} will have same axis length as self. - - Parameters - ---------- - func : function, str, list or dict - Function to use for transforming the data. If a function, must either - work when passed a {klass} or when passed to {klass}.apply. - - Accepted combinations are: - - - function - - string function name - - list of functions and/or function names, e.g. 
``[np.exp, 'sqrt']`` - - dict of axis labels -> functions, function names or list of such. - {axis} - *args - Positional arguments to pass to `func`. - **kwargs - Keyword arguments to pass to `func`. - - Returns - ------- - {klass} - A {klass} that must have the same length as self. - - Raises - ------ - ValueError : If the returned {klass} has a different length than self. - - See Also - -------- - {klass}.agg : Only perform aggregating type operations. - {klass}.apply : Invoke function on a {klass}. - - Examples - -------- - >>> df = pd.DataFrame({{'A': range(3), 'B': range(1, 4)}}) - >>> df - A B - 0 0 1 - 1 1 2 - 2 2 3 - >>> df.transform(lambda x: x + 1) - A B - 0 1 2 - 1 2 3 - 2 3 4 - - Even though the resulting {klass} must have the same length as the - input {klass}, it is possible to provide several input functions: - - >>> s = pd.Series(range(3)) - >>> s - 0 0 - 1 1 - 2 2 - dtype: int64 - >>> s.transform([np.sqrt, np.exp]) - sqrt exp - 0 0.000000 1.000000 - 1 1.000000 2.718282 - 2 1.414214 7.389056 - """ - raise NotImplementedError - # ---------------------------------------------------------------------- # Misc methods diff --git a/pandas/core/series.py b/pandas/core/series.py index a378733cc61af..f5151953efb98 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -25,6 +25,7 @@ from pandas._libs import lib, properties, reshape, tslibs from pandas._libs.lib import no_default from pandas._typing import ( + AggFuncType, ArrayLike, Axis, DtypeObj, @@ -89,6 +90,7 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.indexing import check_bool_indexer from pandas.core.internals import SingleBlockManager +from pandas.core.shared_docs import _shared_docs from pandas.core.sorting import ensure_key_mapped from pandas.core.strings import StringMethods from pandas.core.tools.datetimes import to_datetime @@ -4080,12 +4082,10 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): agg = aggregate - @doc( - NDFrame.transform, - 
klass=_shared_doc_kwargs["klass"], - axis=_shared_doc_kwargs["axis"], - ) - def transform(self, func, axis=0, *args, **kwargs): + @Appender(_shared_docs["transform"]) + def transform( + self, func: AggFuncType, axis: Axis = 0, *args, **kwargs + ) -> FrameOrSeriesUnion: from pandas.core.aggregation import transform return transform(self, func, axis, *args, **kwargs) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 0aaccb47efc44..244ee3aa298db 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -257,3 +257,72 @@ 1 b B E 3 2 c B E 5 """ + +_shared_docs[ + "transform" +] = """\ +Call ``func`` on self producing a {klass} with transformed values. + +Produced {klass} will have same axis length as self. + +Parameters +---------- +func : function, str, list or dict + Function to use for transforming the data. If a function, must either + work when passed a {klass} or when passed to {klass}.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.exp, 'sqrt']`` + - dict of axis labels -> functions, function names or list of such. +{axis} +*args + Positional arguments to pass to `func`. +**kwargs + Keyword arguments to pass to `func`. + +Returns +------- +{klass} + A {klass} that must have the same length as self. + +Raises +------ +ValueError : If the returned {klass} has a different length than self. + +See Also +-------- +{klass}.agg : Only perform aggregating type operations. +{klass}.apply : Invoke function on a {klass}. 
+ +Examples +-------- +>>> df = pd.DataFrame({{'A': range(3), 'B': range(1, 4)}}) +>>> df + A B +0 0 1 +1 1 2 +2 2 3 +>>> df.transform(lambda x: x + 1) + A B +0 1 2 +1 2 3 +2 3 4 + +Even though the resulting {klass} must have the same length as the +input {klass}, it is possible to provide several input functions: + +>>> s = pd.Series(range(3)) +>>> s +0 0 +1 1 +2 2 +dtype: int64 +>>> s.transform([np.sqrt, np.exp]) + sqrt exp +0 0.000000 1.000000 +1 1.000000 2.718282 +2 1.414214 7.389056 +""" diff --git a/pandas/tests/frame/apply/test_frame_transform.py b/pandas/tests/frame/apply/test_frame_transform.py index 7ca9c3f66ca13..ffd1fb0c7492c 100644 --- a/pandas/tests/frame/apply/test_frame_transform.py +++ b/pandas/tests/frame/apply/test_frame_transform.py @@ -4,16 +4,6 @@ import numpy as np import pytest -import pandas as pd -import pandas._testing as tm -from pandas.tests.frame.common import zip_frames - - -def test_agg_transform(axis, float_frame): - other_axis = 1 if axis in {0, "index"} else 0 - - with np.errstate(all="ignore"): - from pandas import DataFrame, MultiIndex import pandas._testing as tm from pandas.core.base import SpecificationError @@ -21,59 +11,57 @@ def test_agg_transform(axis, float_frame): from pandas.tests.frame.common import zip_frames -def test_transform(axis, float_frame): +def test_transform_ufunc(axis, float_frame): # GH 35964 - other_axis = 1 if axis in {0, "index"} else 0 - with np.errstate(all="ignore"): - f_abs = np.abs(float_frame) f_sqrt = np.sqrt(float_frame) + result = float_frame.transform(np.sqrt, axis=axis) + expected = f_sqrt + tm.assert_frame_equal(result, expected) - # ufunc - result = float_frame.transform(np.sqrt, axis=axis) - expected = f_sqrt.copy() - tm.assert_frame_equal(result, expected) - - result = float_frame.transform(np.sqrt, axis=axis) - tm.assert_frame_equal(result, expected) - - # list-like - expected = f_sqrt.copy() - if axis in {0, "index"}: - expected.columns = pd.MultiIndex.from_product( - 
[float_frame.columns, ["sqrt"]] - ) - else: - expected.index = pd.MultiIndex.from_product([float_frame.index, ["sqrt"]]) - result = float_frame.transform([np.sqrt], axis=axis) - tm.assert_frame_equal(result, expected) - - # multiple items in list - # these are in the order as if we are applying both - # functions per series and then concatting - expected = zip_frames([f_abs, f_sqrt], axis=other_axis) - if axis in {0, "index"}: - expected.columns = pd.MultiIndex.from_product( - [float_frame.columns, ["absolute", "sqrt"]] - ) - else: - expected.index = pd.MultiIndex.from_product( - [float_frame.index, ["absolute", "sqrt"]] - ) - result = float_frame.transform([np.abs, "sqrt"], axis=axis) - tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "ops, names", [([np.sqrt], ["sqrt"]), ([np.abs, np.sqrt], ["absolute", "sqrt"])] +) +def test_transform_list(axis, float_frame, ops, names): + # GH 35964 + other_axis = 1 if axis in {0, "index"} else 0 + with np.errstate(all="ignore"): + expected = zip_frames([op(float_frame) for op in ops], axis=other_axis) + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product([float_frame.columns, names]) + else: + expected.index = MultiIndex.from_product([float_frame.index, names]) + result = float_frame.transform(ops, axis=axis) + tm.assert_frame_equal(result, expected) -def test_transform_and_agg_err(axis, float_frame): - # cannot both transform and agg - msg = "transforms cannot produce aggregated results" - with pytest.raises(ValueError, match=msg): - float_frame.transform(["max", "min"], axis=axis) - msg = "cannot combine transform and aggregation operations" - with pytest.raises(ValueError, match=msg): - with np.errstate(all="ignore"): - float_frame.transform(["max", "sqrt"], axis=axis) +def test_transform_dict(axis, float_frame): + # GH 35964 + if axis == 0 or axis == "index": + e = float_frame.columns[0] + expected = float_frame[[e]].transform(np.abs) + else: + e = float_frame.index[0] + expected = 
float_frame.iloc[[0]].transform(np.abs) + result = float_frame.transform({e: np.abs}, axis=axis) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("use_apply", [True, False]) +def test_transform_udf(axis, float_frame, use_apply): + # GH 35964 + # transform uses UDF either via apply or passing the entire DataFrame + def func(x): + # transform is using apply iff x is not a DataFrame + if use_apply == isinstance(x, DataFrame): + # Force transform to fallback + raise ValueError + return x + 1 + + result = float_frame.transform(func, axis=axis) + expected = float_frame + 1 + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("method", ["abs", "shift", "pct_change", "cumsum", "rank"]) @@ -103,7 +91,8 @@ def test_agg_dict_nested_renaming_depr(): # nested renaming msg = r"nested renamer is not supported" with pytest.raises(SpecificationError, match=msg): - df.transform({"A": {"foo": "min"}, "B": {"bar": "max"}}) + # mypy identifies the argument as an invalid type + df.transform({"A": {"foo": "min"}, "B": {"bar": "max"}}) # type: ignore def test_transform_reducer_raises(all_reductions): @@ -140,32 +129,22 @@ def test_transform_bad_dtype(op): df.transform({"A": [op]}) -@pytest.mark.parametrize("op", transformation_kernels) +@pytest.mark.parametrize("op", ["diff", "pct_change", "cumprod"]) def test_transform_multi_dtypes(op): # GH 35964 df = DataFrame({"A": ["a", "b", "c"], "B": [1, 2, 3]}) - # Determine which columns op will work on - columns = [] - for column in df: - try: - df[column].transform(op) - columns.append(column) - except Exception: - pass - - if len(columns) > 0: - expected = df[columns].transform([op]) - result = df.transform([op]) - tm.assert_equal(result, expected) - - expected = df[columns].transform({column: op for column in columns}) - result = df.transform({column: op for column in columns}) - tm.assert_equal(result, expected) - - expected = df[columns].transform({column: [op] for column in columns}) - result = 
df.transform({column: [op] for column in columns}) - tm.assert_equal(result, expected) + expected = df[["B"]].transform([op]) + result = df.transform([op]) + tm.assert_equal(result, expected) + + expected = df[["B"]].transform({"B": op}) + result = df.transform({"B": op}) + tm.assert_equal(result, expected) + + expected = df[["B"]].transform({"B": [op]}) + result = df.transform({"B": [op]}) + tm.assert_equal(result, expected) @pytest.mark.parametrize("use_apply", [True, False]) diff --git a/pandas/tests/series/apply/test_series_transform.py b/pandas/tests/series/apply/test_series_transform.py index 19329446a633a..cd19a61956ef8 100644 --- a/pandas/tests/series/apply/test_series_transform.py +++ b/pandas/tests/series/apply/test_series_transform.py @@ -7,43 +7,37 @@ from pandas.core.groupby.base import transformation_kernels -def test_transform(string_series): - # transforming functions - +def test_transform_ufunc(string_series): + # GH 35964 with np.errstate(all="ignore"): f_sqrt = np.sqrt(string_series) f_abs = np.abs(string_series) - # ufunc - result = string_series.transform(np.sqrt) - expected = f_sqrt.copy() - tm.assert_series_equal(result, expected) - - # list-like - result = string_series.transform([np.sqrt]) - expected = f_sqrt.to_frame().copy() - expected.columns = ["sqrt"] - tm.assert_frame_equal(result, expected) + # ufunc + result = string_series.transform(np.sqrt) + expected = f_sqrt.copy() + tm.assert_series_equal(result, expected) - result = string_series.transform([np.sqrt]) - tm.assert_frame_equal(result, expected) - result = string_series.transform(["sqrt"]) +@pytest.mark.parametrize( + "ops, names", [([np.sqrt], ["sqrt"]), ([np.abs, np.sqrt], ["absolute", "sqrt"])] +) +def test_transform_list(string_series, ops, names): + # GH 35964 + with np.errstate(all="ignore"): + expected = concat([op(string_series) for op in ops], axis=1) + expected.columns = names + result = string_series.transform(ops) tm.assert_frame_equal(result, expected) - # multiple 
items in list - # these are in the order as if we are applying both functions per - # series and then concatting - result = string_series.transform(["sqrt", "abs"]) - expected = concat([f_sqrt, f_abs], axis=1) - expected.columns = ["sqrt", "abs"] - tm.assert_frame_equal(result, expected) - # dict, provide renaming - expected = concat([f_sqrt, f_abs], axis=1) - expected.columns = ["foo", "bar"] - result = string_series.transform({"foo": np.sqrt, "bar": np.abs}) - tm.assert_frame_equal(result, expected) +def test_transform_dict(string_series): + # GH 35964 + with np.errstate(all="ignore"): + expected = concat([np.sqrt(string_series), np.abs(string_series)], axis=1) + expected.columns = ["foo", "bar"] + result = string_series.transform({"foo": np.sqrt, "bar": np.abs}) + tm.assert_frame_equal(result, expected) def test_transform_udf(axis, string_series): @@ -77,19 +71,6 @@ def test_transform_wont_agg(string_series): string_series.transform(["min", "max"]) msg = "Function did not transform" - expected = pd.concat([f_sqrt, f_abs], axis=1) - result = string_series.transform(["sqrt", "abs"]) - expected.columns = ["sqrt", "abs"] - tm.assert_frame_equal(result, expected) - - -def test_transform_and_agg_error(string_series): - # we are trying to transform with an aggregator - msg = "transforms cannot produce aggregated results" - with pytest.raises(ValueError, match=msg): - string_series.transform(["min", "max"]) - - msg = "cannot combine transform and aggregation operations" with pytest.raises(ValueError, match=msg): with np.errstate(all="ignore"): string_series.transform(["sqrt", "max"]) @@ -97,11 +78,6 @@ def test_transform_and_agg_error(string_series): def test_transform_none_to_type(): # GH34377 - df = pd.DataFrame({"a": [None]}) - - msg = "DataFrame constructor called with incompatible data and dtype" - with pytest.raises(TypeError, match=msg): - df.transform({"a": int}) df = DataFrame({"a": [None]}) msg = "Transform function failed" with pytest.raises(ValueError, 
match=msg): From 9eee0cbea0468a942f4a4c1b0dbca368ca41c7e5 Mon Sep 17 00:00:00 2001 From: Richard Date: Sat, 12 Sep 2020 09:00:57 -0400 Subject: [PATCH 07/10] Docs and linting fixes --- pandas/core/frame.py | 6 +++++- pandas/core/series.py | 6 +++++- pandas/tests/frame/apply/test_frame_transform.py | 2 +- pandas/tests/series/apply/test_series_transform.py | 1 - 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a07a94d94948a..1e5360f39a75e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7461,7 +7461,11 @@ def _aggregate(self, arg, axis=0, *args, **kwargs): agg = aggregate - @Appender(_shared_docs["transform"]) + @doc( + _shared_docs["transform"], + klass=_shared_doc_kwargs["klass"], + axis=_shared_doc_kwargs["axis"], + ) def transform( self, func: AggFuncType, axis: Axis = 0, *args, **kwargs ) -> DataFrame: diff --git a/pandas/core/series.py b/pandas/core/series.py index f5151953efb98..632b93cdcf24b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4082,7 +4082,11 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): agg = aggregate - @Appender(_shared_docs["transform"]) + @doc( + _shared_docs["transform"], + klass=_shared_doc_kwargs["klass"], + axis=_shared_doc_kwargs["axis"], + ) def transform( self, func: AggFuncType, axis: Axis = 0, *args, **kwargs ) -> FrameOrSeriesUnion: diff --git a/pandas/tests/frame/apply/test_frame_transform.py b/pandas/tests/frame/apply/test_frame_transform.py index ffd1fb0c7492c..74bf5465c3c94 100644 --- a/pandas/tests/frame/apply/test_frame_transform.py +++ b/pandas/tests/frame/apply/test_frame_transform.py @@ -92,7 +92,7 @@ def test_agg_dict_nested_renaming_depr(): msg = r"nested renamer is not supported" with pytest.raises(SpecificationError, match=msg): # mypy identifies the argument as an invalid type - df.transform({"A": {"foo": "min"}, "B": {"bar": "max"}}) # type: ignore + df.transform({"A": {"foo": "min"}, "B": {"bar": "max"}}) 
def test_transform_reducer_raises(all_reductions): diff --git a/pandas/tests/series/apply/test_series_transform.py b/pandas/tests/series/apply/test_series_transform.py index cd19a61956ef8..6fdd6e73142a4 100644 --- a/pandas/tests/series/apply/test_series_transform.py +++ b/pandas/tests/series/apply/test_series_transform.py @@ -11,7 +11,6 @@ def test_transform_ufunc(string_series): # GH 35964 with np.errstate(all="ignore"): f_sqrt = np.sqrt(string_series) - f_abs = np.abs(string_series) # ufunc result = string_series.transform(np.sqrt) From cf4f80b6a4a9f56cfe80a95a9e2b569f7ec9923f Mon Sep 17 00:00:00 2001 From: Richard Date: Sat, 12 Sep 2020 11:24:10 -0400 Subject: [PATCH 08/10] Added and improved some tests --- .../tests/frame/apply/test_frame_transform.py | 35 +++++++++++++++++-- .../series/apply/test_series_transform.py | 15 ++++++++ 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/apply/test_frame_transform.py b/pandas/tests/frame/apply/test_frame_transform.py index 74bf5465c3c94..346e60954fc13 100644 --- a/pandas/tests/frame/apply/test_frame_transform.py +++ b/pandas/tests/frame/apply/test_frame_transform.py @@ -20,6 +20,26 @@ def test_transform_ufunc(axis, float_frame): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("op", transformation_kernels) +def test_transform_groupby_kernel(axis, float_frame, op): + # GH 35964 + if op == "cumcount": + pytest.xfail("DataFrame.cumcount does not exist") + if op == "tshift": + pytest.xfail("Only works on time index and is deprecated") + if axis == 1 or axis == "columns": + pytest.xfail("GH 36308: groupby.transform with axis=1 is broken") + + args = [0.0] if op == "fillna" else [] + if axis == 0 or axis == "index": + ones = np.ones(float_frame.shape[0]) + else: + ones = np.ones(float_frame.shape[1]) + expected = float_frame.groupby(ones, axis=axis).transform(op, *args) + result = float_frame.transform(op, axis, *args) + tm.assert_frame_equal(result, expected) + + 
@pytest.mark.parametrize( "ops, names", [([np.sqrt], ["sqrt"]), ([np.abs, np.sqrt], ["absolute", "sqrt"])] ) @@ -129,10 +149,19 @@ def test_transform_bad_dtype(op): df.transform({"A": [op]}) -@pytest.mark.parametrize("op", ["diff", "pct_change", "cumprod"]) -def test_transform_multi_dtypes(op): +@pytest.mark.parametrize("op", transformation_kernels) +def test_transform_partial_failure(op): # GH 35964 - df = DataFrame({"A": ["a", "b", "c"], "B": [1, 2, 3]}) + wont_fail = ["ffill", "bfill", "fillna", "pad", "backfill", "shift"] + if op in wont_fail: + pytest.xfail("Transform kernel is successful on all dtypes") + if op == "cumcount": + pytest.xfail("transform('cumcount') not implemented") + if op == "tshift": + pytest.xfail("Only works on time index; deprecated") + + # Using object makes most transform kernels fail + df = DataFrame({"A": 3 * [object], "B": [1, 2, 3]}) expected = df[["B"]].transform([op]) result = df.transform([op]) diff --git a/pandas/tests/series/apply/test_series_transform.py b/pandas/tests/series/apply/test_series_transform.py index 6fdd6e73142a4..0842674da2a7d 100644 --- a/pandas/tests/series/apply/test_series_transform.py +++ b/pandas/tests/series/apply/test_series_transform.py @@ -18,6 +18,21 @@ def test_transform_ufunc(string_series): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("op", transformation_kernels) +def test_transform_groupby_kernel(string_series, op): + # GH 35964 + if op == "cumcount": + pytest.xfail("Series.cumcount does not exist") + if op == "tshift": + pytest.xfail("Only works on time index and is deprecated") + + args = [0.0] if op == "fillna" else [] + ones = np.ones(string_series.shape[0]) + expected = string_series.groupby(ones).transform(op, *args) + result = string_series.transform(op, 0, *args) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "ops, names", [([np.sqrt], ["sqrt"]), ([np.abs, np.sqrt], ["absolute", "sqrt"])] ) From 69e6807a9ca339d0fb9b0cb0088ebbdb47b34816 Mon 
Sep 17 00:00:00 2001 From: Richard Date: Sat, 12 Sep 2020 11:30:35 -0400 Subject: [PATCH 09/10] whatsnew --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index bce6a735b7b07..a9109e641d165 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -342,7 +342,7 @@ Other ^^^^^ - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`) - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`) -- +- Bug in :meth:`Series.transform` would give incorrect results or raise when the argument ``func`` was a dictionary (:issue:`35811`) .. --------------------------------------------------------------------------- From f66a80624812165ffaf53585f7d98c527026d4e2 Mon Sep 17 00:00:00 2001 From: Richard Date: Sat, 12 Sep 2020 11:32:08 -0400 Subject: [PATCH 10/10] whatsnew --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index a9109e641d165..8864469eaf858 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -343,6 +343,7 @@ Other - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`) - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`) - Bug in :meth:`Series.transform` would give incorrect results or raise when the argument ``func`` was a dictionary (:issue:`35811`) +- .. ---------------------------------------------------------------------------