Skip to content

Commit

Permalink
ENH: Numba engine for EWM.mean (pandas-dev#41267)
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke authored and yeshsurya committed May 6, 2021
1 parent c4e3339 commit d9b5d0b
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 77 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ Other enhancements
- Add support for dict-like names in :class:`MultiIndex.set_names` and :class:`MultiIndex.rename` (:issue:`20421`)
- :func:`pandas.read_excel` can now auto detect .xlsb files (:issue:`35416`)
- :class:`pandas.ExcelWriter` now accepts an ``if_sheet_exists`` parameter to control the behaviour of append mode when writing to existing sheets (:issue:`40230`)
- :meth:`.Rolling.sum`, :meth:`.Expanding.sum`, :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.Rolling.median`, :meth:`.Expanding.median`, :meth:`.Rolling.max`, :meth:`.Expanding.max`, :meth:`.Rolling.min`, and :meth:`.Expanding.min` now support ``Numba`` execution with the ``engine`` keyword (:issue:`38895`)
- :meth:`.Rolling.sum`, :meth:`.Expanding.sum`, :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.ExponentialMovingWindow.mean`, :meth:`.Rolling.median`, :meth:`.Expanding.median`, :meth:`.Rolling.max`, :meth:`.Expanding.max`, :meth:`.Rolling.min`, and :meth:`.Expanding.min` now support ``Numba`` execution with the ``engine`` keyword (:issue:`38895`, :issue:`41267`)
- :meth:`DataFrame.apply` can now accept NumPy unary operators as strings, e.g. ``df.apply("sqrt")``, which was already the case for :meth:`Series.apply` (:issue:`39116`)
- :meth:`DataFrame.apply` can now accept non-callable DataFrame properties as strings, e.g. ``df.apply("size")``, which was already the case for :meth:`Series.apply` (:issue:`39116`)
- :meth:`DataFrame.applymap` can now accept kwargs to pass on to func (:issue:`39987`)
Expand Down
85 changes: 30 additions & 55 deletions pandas/core/window/ewm.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,18 @@
args_compat,
create_section_header,
kwargs_compat,
numba_notes,
template_header,
template_returns,
template_see_also,
window_agg_numba_parameters,
)
from pandas.core.window.indexers import (
BaseIndexer,
ExponentialMovingWindowIndexer,
GroupbyIndexer,
)
from pandas.core.window.numba_ import generate_numba_groupby_ewma_func
from pandas.core.window.numba_ import generate_numba_ewma_func
from pandas.core.window.rolling import (
BaseWindow,
BaseWindowGroupby,
Expand Down Expand Up @@ -372,26 +374,41 @@ def aggregate(self, func, *args, **kwargs):
template_header,
create_section_header("Parameters"),
args_compat,
window_agg_numba_parameters,
kwargs_compat,
create_section_header("Returns"),
template_returns,
create_section_header("See Also"),
template_see_also[:-1],
template_see_also,
create_section_header("Notes"),
numba_notes.replace("\n", "", 1),
window_method="ewm",
aggregation_description="(exponential weighted moment) mean",
agg_method="mean",
)
def mean(self, *args, **kwargs):
nv.validate_window_func("mean", args, kwargs)
window_func = window_aggregations.ewma
window_func = partial(
window_func,
com=self._com,
adjust=self.adjust,
ignore_na=self.ignore_na,
deltas=self._deltas,
)
return self._apply(window_func)
def mean(self, *args, engine=None, engine_kwargs=None, **kwargs):
if maybe_use_numba(engine):
ewma_func = generate_numba_ewma_func(
engine_kwargs, self._com, self.adjust, self.ignore_na, self._deltas
)
return self._apply(
ewma_func,
numba_cache_key=(lambda x: x, "ewma"),
)
elif engine in ("cython", None):
if engine_kwargs is not None:
raise ValueError("cython engine does not accept engine_kwargs")
nv.validate_window_func("mean", args, kwargs)
window_func = partial(
window_aggregations.ewma,
com=self._com,
adjust=self.adjust,
ignore_na=self.ignore_na,
deltas=self._deltas,
)
return self._apply(window_func)
else:
raise ValueError("engine must be either 'numba' or 'cython'")

@doc(
template_header,
Expand Down Expand Up @@ -635,45 +652,3 @@ def _get_window_indexer(self) -> GroupbyIndexer:
window_indexer=ExponentialMovingWindowIndexer,
)
return window_indexer

def mean(self, engine=None, engine_kwargs=None):
"""
Compute the exponentially weighted mean with the cython or numba engine.

Parameters
----------
engine : str, default None
* ``'cython'`` : Runs mean through C-extensions from cython.
* ``'numba'`` : Runs mean through JIT compiled code from numba.
* ``None`` : Defaults to ``'cython'`` or the global setting
``compute.use_numba``

.. versionadded:: 1.2.0

engine_kwargs : dict, default None
* For ``'cython'`` engine, there are no accepted ``engine_kwargs``
* For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
and ``parallel`` dictionary keys. The values must either be ``True`` or
``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
``{'nopython': True, 'nogil': False, 'parallel': False}``.

.. versionadded:: 1.2.0

Returns
-------
Series or DataFrame
Return type is determined by the caller.

Raises
------
ValueError
If ``engine_kwargs`` is given while the cython engine is in use
(``engine`` is ``'cython'``, or ``None`` without numba being selected),
or if ``engine`` is not one of the accepted values.
"""
# numba path: build the ewma kernel from this object's EWM parameters and
# hand it to _apply together with the key it is cached under.
if maybe_use_numba(engine):
groupby_ewma_func = generate_numba_groupby_ewma_func(
engine_kwargs, self._com, self.adjust, self.ignore_na, self._deltas
)
return self._apply(
groupby_ewma_func,
numba_cache_key=(lambda x: x, "groupby_ewma"),
)
# cython path: engine_kwargs are rejected, then the parent class's mean runs.
elif engine in ("cython", None):
if engine_kwargs is not None:
raise ValueError("cython engine does not accept engine_kwargs")
return super().mean()
# Any other engine value is rejected.
else:
raise ValueError("engine must be either 'numba' or 'cython'")
16 changes: 8 additions & 8 deletions pandas/core/window/numba_.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,15 @@ def roll_apply(
return roll_apply


def generate_numba_groupby_ewma_func(
def generate_numba_ewma_func(
engine_kwargs: Optional[Dict[str, bool]],
com: float,
adjust: bool,
ignore_na: bool,
deltas: np.ndarray,
):
"""
Generate a numba jitted groupby ewma function specified by values
Generate a numba jitted ewma function specified by values
from engine_kwargs.
Parameters
Expand All @@ -106,30 +106,30 @@ def generate_numba_groupby_ewma_func(
"""
nopython, nogil, parallel = get_jit_arguments(engine_kwargs)

cache_key = (lambda x: x, "groupby_ewma")
cache_key = (lambda x: x, "ewma")
if cache_key in NUMBA_FUNC_CACHE:
return NUMBA_FUNC_CACHE[cache_key]

numba = import_optional_dependency("numba")

@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def groupby_ewma(
def ewma(
values: np.ndarray,
begin: np.ndarray,
end: np.ndarray,
minimum_periods: int,
) -> np.ndarray:
result = np.empty(len(values))
alpha = 1.0 / (1.0 + com)
old_wt_factor = 1.0 - alpha
new_wt = 1.0 if adjust else alpha

for i in numba.prange(len(begin)):
start = begin[i]
stop = end[i]
window = values[start:stop]
sub_result = np.empty(len(window))

old_wt_factor = 1.0 - alpha
new_wt = 1.0 if adjust else alpha

weighted_avg = window[0]
nobs = int(not np.isnan(weighted_avg))
sub_result[0] = weighted_avg if nobs >= minimum_periods else np.nan
Expand Down Expand Up @@ -166,7 +166,7 @@ def groupby_ewma(

return result

return groupby_ewma
return ewma


def generate_numba_table_func(
Expand Down
40 changes: 27 additions & 13 deletions pandas/tests/window/test_numba.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,30 +123,44 @@ def func_2(x):


@td.skip_if_no("numba", "0.46.0")
class TestGroupbyEWMMean:
def test_invalid_engine(self):
class TestEWMMean:
@pytest.mark.parametrize(
"grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"]
)
def test_invalid_engine(self, grouper):
df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)})
with pytest.raises(ValueError, match="engine must be either"):
df.groupby("A").ewm(com=1.0).mean(engine="foo")
grouper(df).ewm(com=1.0).mean(engine="foo")

def test_invalid_engine_kwargs(self):
@pytest.mark.parametrize(
"grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"]
)
def test_invalid_engine_kwargs(self, grouper):
df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)})
with pytest.raises(ValueError, match="cython engine does not"):
df.groupby("A").ewm(com=1.0).mean(
grouper(df).ewm(com=1.0).mean(
engine="cython", engine_kwargs={"nopython": True}
)

def test_cython_vs_numba(self, nogil, parallel, nopython, ignore_na, adjust):
@pytest.mark.parametrize(
"grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"]
)
def test_cython_vs_numba(
self, grouper, nogil, parallel, nopython, ignore_na, adjust
):
df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)})
gb_ewm = df.groupby("A").ewm(com=1.0, adjust=adjust, ignore_na=ignore_na)
ewm = grouper(df).ewm(com=1.0, adjust=adjust, ignore_na=ignore_na)

engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
result = gb_ewm.mean(engine="numba", engine_kwargs=engine_kwargs)
expected = gb_ewm.mean(engine="cython")
result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs)
expected = ewm.mean(engine="cython")

tm.assert_frame_equal(result, expected)

def test_cython_vs_numba_times(self, nogil, parallel, nopython, ignore_na):
@pytest.mark.parametrize(
"grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"]
)
def test_cython_vs_numba_times(self, grouper, nogil, parallel, nopython, ignore_na):
# GH 40951
halflife = "23 days"
times = to_datetime(
Expand All @@ -160,13 +174,13 @@ def test_cython_vs_numba_times(self, nogil, parallel, nopython, ignore_na):
]
)
df = DataFrame({"A": ["a", "b", "a", "b", "b", "a"], "B": [0, 0, 1, 1, 2, 2]})
gb_ewm = df.groupby("A").ewm(
ewm = grouper(df).ewm(
halflife=halflife, adjust=True, ignore_na=ignore_na, times=times
)

engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
result = gb_ewm.mean(engine="numba", engine_kwargs=engine_kwargs)
expected = gb_ewm.mean(engine="cython")
result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs)
expected = ewm.mean(engine="cython")

tm.assert_frame_equal(result, expected)

Expand Down

0 comments on commit d9b5d0b

Please sign in to comment.