From d9b5d0b22857d7a60d491c90228c649037a98f33 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 2 May 2021 16:57:19 -0700 Subject: [PATCH] ENH: Numba engine for EWM.mean (#41267) --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/core/window/ewm.py | 85 +++++++++++-------------------- pandas/core/window/numba_.py | 16 +++--- pandas/tests/window/test_numba.py | 40 ++++++++++----- 4 files changed, 66 insertions(+), 77 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index ba3bc0c8c8842..658e68d1465e9 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -197,7 +197,7 @@ Other enhancements - Add support for dict-like names in :class:`MultiIndex.set_names` and :class:`MultiIndex.rename` (:issue:`20421`) - :func:`pandas.read_excel` can now auto detect .xlsb files (:issue:`35416`) - :class:`pandas.ExcelWriter` now accepts an ``if_sheet_exists`` parameter to control the behaviour of append mode when writing to existing sheets (:issue:`40230`) -- :meth:`.Rolling.sum`, :meth:`.Expanding.sum`, :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.Rolling.median`, :meth:`.Expanding.median`, :meth:`.Rolling.max`, :meth:`.Expanding.max`, :meth:`.Rolling.min`, and :meth:`.Expanding.min` now support ``Numba`` execution with the ``engine`` keyword (:issue:`38895`) +- :meth:`.Rolling.sum`, :meth:`.Expanding.sum`, :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.ExponentialMovingWindow.mean`, :meth:`.Rolling.median`, :meth:`.Expanding.median`, :meth:`.Rolling.max`, :meth:`.Expanding.max`, :meth:`.Rolling.min`, and :meth:`.Expanding.min` now support ``Numba`` execution with the ``engine`` keyword (:issue:`38895`, :issue:`41267`) - :meth:`DataFrame.apply` can now accept NumPy unary operators as strings, e.g. ``df.apply("sqrt")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) - :meth:`DataFrame.apply` can now accept non-callable DataFrame properties as strings, e.g. ``df.apply("size")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) - :meth:`DataFrame.applymap` can now accept kwargs to pass on to func (:issue:`39987`) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 4a210d8b47e9b..08a65964f278e 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -29,16 +29,18 @@ args_compat, create_section_header, kwargs_compat, + numba_notes, template_header, template_returns, template_see_also, + window_agg_numba_parameters, ) from pandas.core.window.indexers import ( BaseIndexer, ExponentialMovingWindowIndexer, GroupbyIndexer, ) -from pandas.core.window.numba_ import generate_numba_groupby_ewma_func +from pandas.core.window.numba_ import generate_numba_ewma_func from pandas.core.window.rolling import ( BaseWindow, BaseWindowGroupby, @@ -372,26 +374,41 @@ def aggregate(self, func, *args, **kwargs): template_header, create_section_header("Parameters"), args_compat, + window_agg_numba_parameters, kwargs_compat, create_section_header("Returns"), template_returns, create_section_header("See Also"), - template_see_also[:-1], + template_see_also, + create_section_header("Notes"), + numba_notes.replace("\n", "", 1), window_method="ewm", aggregation_description="(exponential weighted moment) mean", agg_method="mean", ) - def mean(self, *args, **kwargs): - nv.validate_window_func("mean", args, kwargs) - window_func = window_aggregations.ewma - window_func = partial( - window_func, - com=self._com, - adjust=self.adjust, - ignore_na=self.ignore_na, - deltas=self._deltas, - ) - return self._apply(window_func) + def mean(self, *args, engine=None, engine_kwargs=None, **kwargs): + if maybe_use_numba(engine): + ewma_func = generate_numba_ewma_func( + engine_kwargs, self._com, self.adjust, self.ignore_na, self._deltas + ) + return self._apply( + ewma_func, + numba_cache_key=(lambda x: x, "ewma"), + ) + elif engine in ("cython", None): + if engine_kwargs is not None: + raise ValueError("cython engine does not accept engine_kwargs") + nv.validate_window_func("mean", args, kwargs) + window_func = partial( + window_aggregations.ewma, + com=self._com, + adjust=self.adjust, + ignore_na=self.ignore_na, + deltas=self._deltas, + ) + return self._apply(window_func) + else: + raise ValueError("engine must be either 'numba' or 'cython'") @doc( template_header, @@ -635,45 +652,3 @@ def _get_window_indexer(self) -> GroupbyIndexer: window_indexer=ExponentialMovingWindowIndexer, ) return window_indexer - - def mean(self, engine=None, engine_kwargs=None): - """ - Parameters - ---------- - engine : str, default None - * ``'cython'`` : Runs mean through C-extensions from cython. - * ``'numba'`` : Runs mean through JIT compiled code from numba. - Only available when ``raw`` is set to ``True``. - * ``None`` : Defaults to ``'cython'`` or globally setting - ``compute.use_numba`` - - .. versionadded:: 1.2.0 - - engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{'nopython': True, 'nogil': False, 'parallel': False}``. - - .. versionadded:: 1.2.0 - - Returns - ------- - Series or DataFrame - Return type is determined by the caller. - """ - if maybe_use_numba(engine): - groupby_ewma_func = generate_numba_groupby_ewma_func( - engine_kwargs, self._com, self.adjust, self.ignore_na, self._deltas - ) - return self._apply( - groupby_ewma_func, - numba_cache_key=(lambda x: x, "groupby_ewma"), - ) - elif engine in ("cython", None): - if engine_kwargs is not None: - raise ValueError("cython engine does not accept engine_kwargs") - return super().mean() - else: - raise ValueError("engine must be either 'numba' or 'cython'") diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index d84dea7ee622c..9407efd0bef2b 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -80,7 +80,7 @@ def roll_apply( return roll_apply -def generate_numba_groupby_ewma_func( +def generate_numba_ewma_func( engine_kwargs: Optional[Dict[str, bool]], com: float, adjust: bool, @@ -88,7 +88,7 @@ def generate_numba_groupby_ewma_func( deltas: np.ndarray, ): """ - Generate a numba jitted groupby ewma function specified by values + Generate a numba jitted ewma function specified by values from engine_kwargs. Parameters @@ -106,14 +106,14 @@ def generate_numba_groupby_ewma_func( """ nopython, nogil, parallel = get_jit_arguments(engine_kwargs) - cache_key = (lambda x: x, "groupby_ewma") + cache_key = (lambda x: x, "ewma") if cache_key in NUMBA_FUNC_CACHE: return NUMBA_FUNC_CACHE[cache_key] numba = import_optional_dependency("numba") @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) - def groupby_ewma( + def ewma( values: np.ndarray, begin: np.ndarray, end: np.ndarray, @@ -121,15 +121,15 @@ def groupby_ewma( ) -> np.ndarray: result = np.empty(len(values)) alpha = 1.0 / (1.0 + com) + old_wt_factor = 1.0 - alpha + new_wt = 1.0 if adjust else alpha + for i in numba.prange(len(begin)): start = begin[i] stop = end[i] window = values[start:stop] sub_result = np.empty(len(window)) - old_wt_factor = 1.0 - alpha - new_wt = 1.0 if adjust else alpha - weighted_avg = window[0] nobs = int(not np.isnan(weighted_avg)) sub_result[0] = weighted_avg if nobs >= minimum_periods else np.nan @@ -166,7 +166,7 @@ def groupby_ewma( return result - return groupby_ewma + return ewma def generate_numba_table_func( diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 06b34201e0dba..b79c367d482ae 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -123,30 +123,44 @@ def func_2(x): @td.skip_if_no("numba", "0.46.0") -class TestGroupbyEWMMean: - def test_invalid_engine(self): +class TestEWMMean: + @pytest.mark.parametrize( + "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"] + ) + def test_invalid_engine(self, grouper): df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)}) with pytest.raises(ValueError, match="engine must be either"): - df.groupby("A").ewm(com=1.0).mean(engine="foo") + grouper(df).ewm(com=1.0).mean(engine="foo") - def test_invalid_engine_kwargs(self): + @pytest.mark.parametrize( + "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"] + ) + def test_invalid_engine_kwargs(self, grouper): df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)}) with pytest.raises(ValueError, match="cython engine does not"): - df.groupby("A").ewm(com=1.0).mean( + grouper(df).ewm(com=1.0).mean( engine="cython", engine_kwargs={"nopython": True} ) - def test_cython_vs_numba(self, nogil, parallel, nopython, ignore_na, adjust): + @pytest.mark.parametrize( + "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"] + ) + def test_cython_vs_numba( + self, grouper, nogil, parallel, nopython, ignore_na, adjust + ): df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)}) - gb_ewm = df.groupby("A").ewm(com=1.0, adjust=adjust, ignore_na=ignore_na) + ewm = grouper(df).ewm(com=1.0, adjust=adjust, ignore_na=ignore_na) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - result = gb_ewm.mean(engine="numba", engine_kwargs=engine_kwargs) - expected = gb_ewm.mean(engine="cython") + result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs) + expected = ewm.mean(engine="cython") tm.assert_frame_equal(result, expected) - def test_cython_vs_numba_times(self, nogil, parallel, nopython, ignore_na): + @pytest.mark.parametrize( + "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"] + ) + def test_cython_vs_numba_times(self, grouper, nogil, parallel, nopython, ignore_na): # GH 40951 halflife = "23 days" times = to_datetime( @@ -160,13 +174,13 @@ def test_cython_vs_numba_times(self, nogil, parallel, nopython, ignore_na): ] ) df = DataFrame({"A": ["a", "b", "a", "b", "b", "a"], "B": [0, 0, 1, 1, 2, 2]}) - gb_ewm = df.groupby("A").ewm( + ewm = grouper(df).ewm( halflife=halflife, adjust=True, ignore_na=ignore_na, times=times ) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - result = gb_ewm.mean(engine="numba", engine_kwargs=engine_kwargs) - expected = gb_ewm.mean(engine="cython") + result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs) + expected = ewm.mean(engine="cython") tm.assert_frame_equal(result, expected)