ENH: Numba engine for EWM.mean (pandas-dev#41267)

Author: yeshsurya · Committed: May 6, 2021
Commit d9b5d0b · 1 parent: c4e3339

Showing 4 changed files with 66 additions and 77 deletions.
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -197,7 +197,7 @@ Other enhancements
 - Add support for dict-like names in :class:`MultiIndex.set_names` and :class:`MultiIndex.rename` (:issue:`20421`)
 - :func:`pandas.read_excel` can now auto detect .xlsb files (:issue:`35416`)
 - :class:`pandas.ExcelWriter` now accepts an ``if_sheet_exists`` parameter to control the behaviour of append mode when writing to existing sheets (:issue:`40230`)
-- :meth:`.Rolling.sum`, :meth:`.Expanding.sum`, :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.Rolling.median`, :meth:`.Expanding.median`, :meth:`.Rolling.max`, :meth:`.Expanding.max`, :meth:`.Rolling.min`, and :meth:`.Expanding.min` now support ``Numba`` execution with the ``engine`` keyword (:issue:`38895`)
+- :meth:`.Rolling.sum`, :meth:`.Expanding.sum`, :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.ExponentialMovingWindow.mean`, :meth:`.Rolling.median`, :meth:`.Expanding.median`, :meth:`.Rolling.max`, :meth:`.Expanding.max`, :meth:`.Rolling.min`, and :meth:`.Expanding.min` now support ``Numba`` execution with the ``engine`` keyword (:issue:`38895`, :issue:`41267`)
 - :meth:`DataFrame.apply` can now accept NumPy unary operators as strings, e.g. ``df.apply("sqrt")``, which was already the case for :meth:`Series.apply` (:issue:`39116`)
 - :meth:`DataFrame.apply` can now accept non-callable DataFrame properties as strings, e.g. ``df.apply("size")``, which was already the case for :meth:`Series.apply` (:issue:`39116`)
 - :meth:`DataFrame.applymap` can now accept kwargs to pass on to func (:issue:`39987`)

diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py
@@ -29,16 +29,18 @@
     args_compat,
     create_section_header,
     kwargs_compat,
+    numba_notes,
     template_header,
     template_returns,
     template_see_also,
+    window_agg_numba_parameters,
 )
 from pandas.core.window.indexers import (
     BaseIndexer,
     ExponentialMovingWindowIndexer,
     GroupbyIndexer,
 )
-from pandas.core.window.numba_ import generate_numba_groupby_ewma_func
+from pandas.core.window.numba_ import generate_numba_ewma_func
 from pandas.core.window.rolling import (
     BaseWindow,
     BaseWindowGroupby,
@@ -372,26 +374,41 @@ def aggregate(self, func, *args, **kwargs):
         template_header,
         create_section_header("Parameters"),
         args_compat,
+        window_agg_numba_parameters,
         kwargs_compat,
         create_section_header("Returns"),
         template_returns,
         create_section_header("See Also"),
-        template_see_also[:-1],
+        template_see_also,
+        create_section_header("Notes"),
+        numba_notes.replace("\n", "", 1),
         window_method="ewm",
         aggregation_description="(exponential weighted moment) mean",
         agg_method="mean",
     )
-    def mean(self, *args, **kwargs):
-        nv.validate_window_func("mean", args, kwargs)
-        window_func = window_aggregations.ewma
-        window_func = partial(
-            window_func,
-            com=self._com,
-            adjust=self.adjust,
-            ignore_na=self.ignore_na,
-            deltas=self._deltas,
-        )
-        return self._apply(window_func)
+    def mean(self, *args, engine=None, engine_kwargs=None, **kwargs):
+        if maybe_use_numba(engine):
+            ewma_func = generate_numba_ewma_func(
+                engine_kwargs, self._com, self.adjust, self.ignore_na, self._deltas
+            )
+            return self._apply(
+                ewma_func,
+                numba_cache_key=(lambda x: x, "ewma"),
+            )
+        elif engine in ("cython", None):
+            if engine_kwargs is not None:
+                raise ValueError("cython engine does not accept engine_kwargs")
+            nv.validate_window_func("mean", args, kwargs)
+            window_func = partial(
+                window_aggregations.ewma,
+                com=self._com,
+                adjust=self.adjust,
+                ignore_na=self.ignore_na,
+                deltas=self._deltas,
+            )
+            return self._apply(window_func)
+        else:
+            raise ValueError("engine must be either 'numba' or 'cython'")
 
     @doc(
         template_header,
@@ -635,45 +652,3 @@ def _get_window_indexer(self) -> GroupbyIndexer:
             window_indexer=ExponentialMovingWindowIndexer,
         )
         return window_indexer
-
-    def mean(self, engine=None, engine_kwargs=None):
-        """
-        Parameters
-        ----------
-        engine : str, default None
-            * ``'cython'`` : Runs mean through C-extensions from cython.
-            * ``'numba'`` : Runs mean through JIT compiled code from numba.
-              Only available when ``raw`` is set to ``True``.
-            * ``None`` : Defaults to ``'cython'`` or globally setting
-              ``compute.use_numba``
-
-              .. versionadded:: 1.2.0
-
-        engine_kwargs : dict, default None
-            * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
-            * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
-              and ``parallel`` dictionary keys. The values must either be ``True`` or
-              ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
-              ``{'nopython': True, 'nogil': False, 'parallel': False}``.
-
-              .. versionadded:: 1.2.0
-
-        Returns
-        -------
-        Series or DataFrame
-            Return type is determined by the caller.
-        """
-        if maybe_use_numba(engine):
-            groupby_ewma_func = generate_numba_groupby_ewma_func(
-                engine_kwargs, self._com, self.adjust, self.ignore_na, self._deltas
-            )
-            return self._apply(
-                groupby_ewma_func,
-                numba_cache_key=(lambda x: x, "groupby_ewma"),
-            )
-        elif engine in ("cython", None):
-            if engine_kwargs is not None:
-                raise ValueError("cython engine does not accept engine_kwargs")
-            return super().mean()
-        else:
-            raise ValueError("engine must be either 'numba' or 'cython'")
diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py
@@ -80,15 +80,15 @@ def roll_apply(
     return roll_apply
 
 
-def generate_numba_groupby_ewma_func(
+def generate_numba_ewma_func(
     engine_kwargs: Optional[Dict[str, bool]],
     com: float,
     adjust: bool,
     ignore_na: bool,
     deltas: np.ndarray,
 ):
     """
-    Generate a numba jitted groupby ewma function specified by values
+    Generate a numba jitted ewma function specified by values
     from engine_kwargs.
 
     Parameters
@@ -106,30 +106,30 @@ def generate_numba_groupby_ewma_func(
     """
     nopython, nogil, parallel = get_jit_arguments(engine_kwargs)
 
-    cache_key = (lambda x: x, "groupby_ewma")
+    cache_key = (lambda x: x, "ewma")
     if cache_key in NUMBA_FUNC_CACHE:
         return NUMBA_FUNC_CACHE[cache_key]
 
     numba = import_optional_dependency("numba")
 
     @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
-    def groupby_ewma(
+    def ewma(
         values: np.ndarray,
         begin: np.ndarray,
         end: np.ndarray,
         minimum_periods: int,
     ) -> np.ndarray:
         result = np.empty(len(values))
         alpha = 1.0 / (1.0 + com)
+        old_wt_factor = 1.0 - alpha
+        new_wt = 1.0 if adjust else alpha
+
         for i in numba.prange(len(begin)):
             start = begin[i]
             stop = end[i]
             window = values[start:stop]
             sub_result = np.empty(len(window))
 
-            old_wt_factor = 1.0 - alpha
-            new_wt = 1.0 if adjust else alpha
-
             weighted_avg = window[0]
             nobs = int(not np.isnan(weighted_avg))
             sub_result[0] = weighted_avg if nobs >= minimum_periods else np.nan
@@ -166,7 +166,7 @@ def groupby_ewma(
 
         return result
 
-    return groupby_ewma
+    return ewma
 
 
 def generate_numba_table_func(

diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py
@@ -123,30 +123,44 @@ def func_2(x):
 
 
 @td.skip_if_no("numba", "0.46.0")
-class TestGroupbyEWMMean:
-    def test_invalid_engine(self):
+class TestEWMMean:
+    @pytest.mark.parametrize(
+        "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"]
+    )
+    def test_invalid_engine(self, grouper):
         df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)})
         with pytest.raises(ValueError, match="engine must be either"):
-            df.groupby("A").ewm(com=1.0).mean(engine="foo")
+            grouper(df).ewm(com=1.0).mean(engine="foo")
 
-    def test_invalid_engine_kwargs(self):
+    @pytest.mark.parametrize(
+        "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"]
+    )
+    def test_invalid_engine_kwargs(self, grouper):
         df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)})
         with pytest.raises(ValueError, match="cython engine does not"):
-            df.groupby("A").ewm(com=1.0).mean(
+            grouper(df).ewm(com=1.0).mean(
                 engine="cython", engine_kwargs={"nopython": True}
             )
 
-    def test_cython_vs_numba(self, nogil, parallel, nopython, ignore_na, adjust):
+    @pytest.mark.parametrize(
+        "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"]
+    )
+    def test_cython_vs_numba(
+        self, grouper, nogil, parallel, nopython, ignore_na, adjust
+    ):
         df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)})
-        gb_ewm = df.groupby("A").ewm(com=1.0, adjust=adjust, ignore_na=ignore_na)
+        ewm = grouper(df).ewm(com=1.0, adjust=adjust, ignore_na=ignore_na)
 
         engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
-        result = gb_ewm.mean(engine="numba", engine_kwargs=engine_kwargs)
-        expected = gb_ewm.mean(engine="cython")
+        result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs)
+        expected = ewm.mean(engine="cython")
 
         tm.assert_frame_equal(result, expected)
 
-    def test_cython_vs_numba_times(self, nogil, parallel, nopython, ignore_na):
+    @pytest.mark.parametrize(
+        "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"]
+    )
+    def test_cython_vs_numba_times(self, grouper, nogil, parallel, nopython, ignore_na):
         # GH 40951
         halflife = "23 days"
         times = to_datetime(
@@ -160,13 +174,13 @@ def test_cython_vs_numba_times(self, nogil, parallel, nopython, ignore_na):
             ]
         )
         df = DataFrame({"A": ["a", "b", "a", "b", "b", "a"], "B": [0, 0, 1, 1, 2, 2]})
-        gb_ewm = df.groupby("A").ewm(
+        ewm = grouper(df).ewm(
             halflife=halflife, adjust=True, ignore_na=ignore_na, times=times
         )
 
         engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
-        result = gb_ewm.mean(engine="numba", engine_kwargs=engine_kwargs)
-        expected = gb_ewm.mean(engine="cython")
+        result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs)
+        expected = ewm.mean(engine="cython")
 
         tm.assert_frame_equal(result, expected)