diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
index 2d30e00142846e..7e707de998b878 100644
--- a/doc/source/whatsnew/v0.22.0.txt
+++ b/doc/source/whatsnew/v0.22.0.txt
@@ -3,12 +3,179 @@
 v0.22.0
 -------
 
-This is a major release from 0.21.1 and includes a number of API changes,
-deprecations, new features, enhancements, and performance improvements along
-with a large number of bug fixes. We recommend that all users upgrade to this
-version.
+This is a major release from 0.21.1 and includes a single, API breaking change.
+We recommend that all users upgrade to this version after carefully reading the
+release note (singular!).
 
 .. _whatsnew_0220.api_breaking:
 
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Pandas 0.22.0 changes the handling of empty and all-NA sums and products. The
+summary is:
+
+* The sum of an all-NA or empty series is now 0.
+* The product of an all-NA or empty series is now 1.
+* We've added a ``min_count`` parameter to ``.sum`` and ``.prod`` to control
+  the minimum number of valid values required for a non-NA result. If fewer
+  than ``min_count`` valid values are present, the result is NA. The default
+  is ``0``. To restore the 0.21 behavior, use ``min_count=1``.
+
+Some background: In pandas 0.21, we fixed a long-standing inconsistency
+in the return value of all-NA series depending on whether or not bottleneck
+was installed. See :ref:`whatsnew_0210.api_breaking.bottleneck`. At the same
+time, we changed the sum and prod of an empty Series to also be ``NaN``.
+
+Based on feedback, we've partially reverted those changes. The default sum
+for all-NA and empty series is now 0 (1 for ``prod``).
+
+*pandas 0.21*
+
+.. code-block:: ipython
+
+    In [1]: import pandas as pd
+
+    In [2]: import numpy as np
+
+    In [3]: pd.Series([]).sum()
+    Out[3]: nan
+
+    In [4]: pd.Series([np.nan]).sum()
+    Out[4]: nan
+
+*pandas 0.22.0*
+
+.. ipython:: python
+
+    pd.Series([]).sum()
+    pd.Series([np.nan]).sum()
+
+To have the sum of an empty series return ``NaN``, use the ``min_count``
+keyword. Thanks to the ``skipna`` parameter, the ``.sum`` of an all-NA
+series is conceptually the same as the sum of an empty series. The
+``min_count`` parameter refers to the minimum number of *valid* values
+required for a non-NA sum or product.
+
+.. ipython:: python
+
+    pd.Series([]).sum(min_count=1)
+    pd.Series([np.nan]).sum(min_count=1)
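+
+``min_count`` counts only valid (non-NA) values, so it applies to partially
+valid series as well. A quick illustration (the values here are chosen purely
+for demonstration):
+
+.. ipython:: python
+
+    pd.Series([1.0, np.nan]).sum(min_count=2)
+    pd.Series([1.0, 2.0]).sum(min_count=2)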
+
+Note that this affects some other places in the library:
+
+1. Grouping by a Categorical with some unobserved categories
+
+*pandas 0.21*
+
+.. code-block:: ipython
+
+    In [3]: grouper = pd.Categorical(['a', 'a'], categories=['a', 'b'])
+
+    In [4]: pd.Series([1, 2]).groupby(grouper).sum()
+    Out[4]:
+    a    3.0
+    b    NaN
+    dtype: float64
+
+*pandas 0.22.0*
+
+.. ipython:: python
+
+    grouper = pd.Categorical(['a', 'a'], categories=['a', 'b'])
+    pd.Series([1, 2]).groupby(grouper).sum()
+
+    pd.Series([1, 2]).groupby(grouper).sum(min_count=1)
+
+2. Resampling
+
+The output for an all-NaN bin will change:
+
+*pandas 0.21.0*
+
+.. code-block:: ipython
+
+    In [1]: import pandas as pd; import numpy as np;
+
+    In [2]: s = pd.Series([1, 1, np.nan, np.nan],
+       ...:               index=pd.date_range('2017', periods=4))
+       ...:
+
+    In [3]: s
+    Out[3]:
+    2017-01-01    1.0
+    2017-01-02    1.0
+    2017-01-03    NaN
+    2017-01-04    NaN
+    Freq: D, dtype: float64
+
+    In [4]: s.resample('2d').sum()
+    Out[4]:
+    2017-01-01    2.0
+    2017-01-03    NaN
+    Freq: 2D, dtype: float64
+
+*pandas 0.22.0*
+
+.. ipython:: python
+
+    s = pd.Series([1, 1, np.nan, np.nan],
+                  index=pd.date_range('2017', periods=4))
+    s.resample('2d').sum()
+
+To restore the 0.21 behavior of ``NaN`` for all-NaN bins, use ``min_count=1``.
+
+.. ipython:: python
+
+    s.resample('2d').sum(min_count=1)
+
+Upsampling in particular is affected, as it introduces bins containing no
+data even if the original series was entirely valid.
+
+*pandas 0.21.0*
+
+.. code-block:: ipython
+
+    In [5]: idx = pd.DatetimeIndex(['2017-01-01', '2017-01-02'])
+
+    In [6]: pd.Series([1, 2], index=idx).resample('12H').sum()
+    Out[6]:
+    2017-01-01 00:00:00    1.0
+    2017-01-01 12:00:00    NaN
+    2017-01-02 00:00:00    2.0
+    Freq: 12H, dtype: float64
+
+*pandas 0.22.0*
+
+.. ipython:: python
+
+    idx = pd.DatetimeIndex(['2017-01-01', '2017-01-02'])
+    pd.Series([1, 2], index=idx).resample("12H").sum()
+
+    pd.Series([1, 2], index=idx).resample("12H").sum(min_count=1)
+
+3. Rolling / Expanding
+
+Rolling and expanding already have a ``min_periods`` keyword that behaves
+similarly to ``min_count``. The only case that changes is a rolling or
+expanding sum over an all-NaN series with ``min_periods=0``.
+
+*pandas 0.21.1*
+
+.. code-block:: ipython
+
+    In [7]: s = pd.Series([np.nan, np.nan])
+
+    In [8]: s.rolling(2, min_periods=0).sum()
+    Out[8]:
+    0   NaN
+    1   NaN
+    dtype: float64
+
+*pandas 0.22.0*
+
+.. ipython:: python
+
+    s = pd.Series([np.nan, np.nan])
+    s.rolling(2, min_periods=0).sum()
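+
+The same applies to ``expanding``. A quick sketch, reusing ``s`` from the
+rolling example above:
+
+.. ipython:: python
+
+    s.expanding(min_periods=0).sum()
+    s.expanding(min_periods=1).sum()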
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index 16b7cbff44e034..14d47398ac1dff 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -37,7 +37,7 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
               ndarray[int64_t] counts,
               ndarray[{{c_type}}, ndim=2] values,
               ndarray[int64_t] labels,
-              Py_ssize_t min_count=1):
+              Py_ssize_t min_count=0):
     """
     Only aggregates on axis=0
     """
@@ -101,7 +101,7 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                ndarray[int64_t] counts,
                ndarray[{{c_type}}, ndim=2] values,
                ndarray[int64_t] labels,
-               Py_ssize_t min_count=1):
+               Py_ssize_t min_count=0):
     """
     Only aggregates on axis=0
     """
diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx
index ecce45742afa79..c7ff4292941893 100644
--- a/pandas/_libs/window.pyx
+++ b/pandas/_libs/window.pyx
@@ -443,10 +443,17 @@ def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp,
         double val, prev_x, sum_x = 0
         int64_t s, e
         int64_t nobs = 0, i, j, N
+        int64_t minp2 = -1
        bint is_variable
         ndarray[int64_t] start, end
         ndarray[double_t] output
 
+    if minp == 0:
+        # get_window_indexer coerces minp to be at least 1. That is fine
+        # everywhere except for an all-missing window (nobs = 0) with a
+        # user-supplied minp of 0: there the sum should be 0, not NA.
+        # Record the original minp=0 so it can be restored before
+        # computing each output value below.
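+        #
+        # As a pure-Python sketch of the gate calc_sum applies (an
+        # illustration only, not code that runs here):
+        #
+        #     def calc_sum_sketch(minp, nobs, sum_x):
+        #         return sum_x if nobs >= minp else NaN
+        #
+        # With minp restored to 0, an all-NaN window has nobs == 0 and
+        # 0 >= 0, so it returns its running sum of 0.0 instead of NaN.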
+        minp2 = 0
+
     start, end, N, win, minp, is_variable = get_window_indexer(input, win,
                                                                minp, index,
                                                                closed)
@@ -483,6 +490,8 @@
                 for j in range(end[i - 1], e):
                     add_sum(input[j], &nobs, &sum_x)
 
+            if minp2 == 0:
+                minp = 0
             output[i] = calc_sum(minp, nobs, sum_x)
 
     else:
@@ -503,6 +512,8 @@
             prev_x = input[i - win]
             remove_sum(prev_x, &nobs, &sum_x)
 
+        if minp2 == 0:
+            minp = 0
         output[i] = calc_sum(minp, nobs, sum_x)
 
     return output
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 654d6fe1c27b81..ed1e8c961e51ad 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -7619,48 +7619,48 @@ def _doc_parms(cls):
 _sum_examples = """\
 Examples
 --------
-By default, the sum of an empty series is ``NaN``.
+By default, the sum of an empty series is ``0``.
 
->>> pd.Series([]).sum()  # min_count=1 is the default
-nan
+>>> pd.Series([]).sum()  # min_count=0 is the default
+0.0
 
 This can be controlled with the ``min_count`` parameter. For example, if
-you'd like the sum of an empty series to be 0, pass ``min_count=0``.
+you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
 
->>> pd.Series([]).sum(min_count=0)
-0.0
+>>> pd.Series([]).sum(min_count=1)
+nan
 
 Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
 empty series identically.
 
 >>> pd.Series([np.nan]).sum()
-nan
-
->>> pd.Series([np.nan]).sum(min_count=0)
 0.0
+
+>>> pd.Series([np.nan]).sum(min_count=1)
+nan
 """
 
 _prod_examples = """\
 Examples
 --------
-By default, the product of an empty series is ``NaN``
+By default, the product of an empty series is ``1``.
 
 >>> pd.Series([]).prod()
-nan
+1.0
 
-This can be controlled with the ``min_count`` parameter
+This can be controlled with the ``min_count`` parameter.
 
->>> pd.Series([]).prod(min_count=0)
-1.0
+>>> pd.Series([]).prod(min_count=1)
+nan
 
 Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
 empty series identically.
 
 >>> pd.Series([np.nan]).prod()
-nan
-
->>> pd.Series([np.nan]).sum(min_count=0)
 1.0
+
+>>> pd.Series([np.nan]).prod(min_count=1)
+nan
 """
 
@@ -7683,7 +7683,7 @@ def _make_min_count_stat_function(cls, name, name1, name2, axis_descr, desc,
                   examples=examples)
     @Appender(_num_doc)
     def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None,
-                  min_count=1,
+                  min_count=0,
                   **kwargs):
         nv.validate_stat_func(tuple(), kwargs, fname=name)
         if skipna is None:
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index c809d8a6167c87..bec387a3169f24 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -1286,8 +1286,8 @@ def last(x):
             else:
                 return last(x)
 
-    cls.sum = groupby_function('sum', 'add', np.sum, min_count=1)
-    cls.prod = groupby_function('prod', 'prod', np.prod, min_count=1)
+    cls.sum = groupby_function('sum', 'add', np.sum, min_count=0)
+    cls.prod = groupby_function('prod', 'prod', np.prod, min_count=0)
     cls.min = groupby_function('min', 'min', np.min, numeric_only=False)
     cls.max = groupby_function('max', 'max', np.max, numeric_only=False)
     cls.first = groupby_function('first', 'first', first_compat,
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 88f69f6ff2e14e..6868e09c2c2e9e 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -308,7 +308,7 @@ def nanall(values, axis=None, skipna=True):
 
 @disallow('M8')
 @bottleneck_switch()
-def nansum(values, axis=None, skipna=True, min_count=1):
+def nansum(values, axis=None, skipna=True, min_count=0):
     values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
     dtype_sum = dtype_max
     if is_float_dtype(dtype):
@@ -645,7 +645,7 @@ def nankurt(values, axis=None, skipna=True):
 
 @disallow('M8', 'm8')
-def nanprod(values, axis=None, skipna=True, min_count=1):
+def nanprod(values, axis=None, skipna=True, min_count=0):
     mask = isna(values)
     if skipna and not is_any_int_dtype(values):
         values = values.copy()
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index 4bc5b68bb36f0a..cea454fbfc6fa1 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -605,7 +605,7 @@ def size(self):
 
 # downsample methods
 for method in ['sum', 'prod']:
-    def f(self, _method=method, min_count=1, *args, **kwargs):
+    def f(self, _method=method, min_count=0, *args, **kwargs):
         nv.validate_resampler_func(_method, args, kwargs)
         return self._downsample(_method, min_count=min_count)
     f.__doc__ = getattr(GroupBy, method).__doc__
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index 80e9acd0d22811..60089b0277b545 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -478,7 +478,8 @@ def test_nunique(self):
                                 Series({0: 1, 1: 3, 2: 2}))
 
     def test_sum(self):
-        self._check_stat_op('sum', np.sum, has_numeric_only=True)
+        self._check_stat_op('sum', np.sum, has_numeric_only=True,
+                            skipna_alternative=np.nansum)
 
         # mixed types (with upcasting happening)
         self._check_stat_op('sum', np.sum,
@@ -753,7 +754,8 @@ def alt(x):
 
     def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,
                        has_numeric_only=False, check_dtype=True,
-                       check_dates=False, check_less_precise=False):
+                       check_dates=False, check_less_precise=False,
+                       skipna_alternative=None):
         if frame is None:
             frame = self.frame
             # set some NAs
@@ -774,15 +776,19 @@ def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,
             assert len(result)
 
         if has_skipna:
-            def skipna_wrapper(x):
-                nona = x.dropna()
-                if len(nona) == 0:
-                    return np.nan
-                return alternative(nona)
-
             def wrapper(x):
                 return alternative(x.values)
 
+            if skipna_alternative:
+                def skipna_wrapper(x):
+                    return skipna_alternative(x.values)
+            else:
+                def skipna_wrapper(x):
+                    nona = x.dropna()
+                    if len(nona) == 0:
+                        return np.nan
+                    return alternative(nona)
+
             result0 = f(axis=0, skipna=False)
             result1 = f(axis=1, skipna=False)
             tm.assert_series_equal(result0, frame.apply(wrapper),
@@ -834,8 +840,11 @@ def wrapper(x):
             r0 = getattr(all_na, name)(axis=0)
             r1 = getattr(all_na, name)(axis=1)
             if name in ['sum', 'prod']:
-                assert np.isnan(r0).all()
-                assert np.isnan(r1).all()
+                unit = int(name == 'prod')
+                expected = pd.Series(unit, index=r0.index, dtype=r0.dtype)
+                tm.assert_series_equal(r0, expected)
+                expected = pd.Series(unit, index=r1.index, dtype=r1.dtype)
+                tm.assert_series_equal(r1, expected)
 
     def test_mode(self):
         df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11],
diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py
index 07ecc085098bf7..cca21fddd116e1 100644
--- a/pandas/tests/groupby/test_aggregate.py
+++ b/pandas/tests/groupby/test_aggregate.py
@@ -813,8 +813,6 @@ def test__cython_agg_general(self):
         ('mean', np.mean),
         ('median', lambda x: np.median(x) if len(x) > 0 else np.nan),
         ('var', lambda x: np.var(x, ddof=1)),
-        ('add', lambda x: np.sum(x) if len(x) > 0 else np.nan),
-        ('prod', np.prod),
         ('min', np.min),
         ('max', np.max), ]
     )
@@ -824,12 +822,7 @@ def test_cython_agg_empty_buckets(self, op, targop):
 
         # calling _cython_agg_general directly, instead of via the user API
         # which sets different values for min_count, so do that here.
-        if op in ('add', 'prod'):
-            min_count = 1
-        else:
-            min_count = -1
-        result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(
-            op, min_count=min_count)
+        result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op)
         expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x))
         try:
             tm.assert_frame_equal(result, expected)
@@ -837,6 +830,40 @@ def test_cython_agg_empty_buckets(self, op, targop):
             exc.args += ('operation: %s' % op,)
             raise
 
+    def test_cython_agg_empty_buckets_nanops(self):
+        # GH-18869: nanops cannot be called on empty groups, so hard-code
+        # the expected values for those bins.
+        df = pd.DataFrame([11, 12, 13], columns=['a'])
+        grps = range(0, 25, 5)
+        # add / sum
+        result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('add')
+        intervals = pd.interval_range(0, 20, freq=5)
+        expected = pd.DataFrame(
+            {"a": [0, 0, 36, 0]},
+            index=pd.CategoricalIndex(intervals, name='a', ordered=True))
+        tm.assert_frame_equal(result, expected)
+
+        # prod
+        result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('prod')
+        expected = pd.DataFrame(
+            {"a": [1, 1, 1716, 1]},
+            index=pd.CategoricalIndex(intervals, name='a', ordered=True))
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.xfail(reason="GH-18869: agg func not called on empty groups.")
+    def test_agg_category_nansum(self):
+        categories = ['a', 'b', 'c']
+        df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
+                                               categories=categories),
+                           'B': [1, 2, 3]})
+        result = df.groupby("A").B.agg(np.nansum)
+        expected = pd.Series([3, 3, 0],
+                             index=pd.CategoricalIndex(['a', 'b', 'c'],
+                                                       categories=categories,
+                                                       name='A'),
+                             name='B')
+        tm.assert_series_equal(result, expected)
+
     def test_agg_over_numpy_arrays(self):
         # GH 3788
         df = pd.DataFrame([[1, np.array([10, 20, 30])],
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 5e3d2bb9cf091b..1713b2d3015ad2 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -37,7 +37,7 @@ def test_groupby(self):
         # single grouper
         gb = df.groupby("A")
         exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
-        expected = DataFrame({'values': Series([3, 7, np.nan], index=exp_idx)})
+        expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)})
         result = gb.sum()
         tm.assert_frame_equal(result, expected)
 
@@ -670,9 +670,9 @@ def test_empty_sum(self):
                            'B': [1, 2, 1]})
         expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A')
 
-        # NA by default
+        # 0 by default
         result = df.groupby("A").B.sum()
-        expected = pd.Series([3, 1, np.nan], expected_idx, name='B')
+        expected = pd.Series([3, 1, 0], expected_idx, name='B')
         tm.assert_series_equal(result, expected)
 
         # min_count=0
@@ -693,9 +693,9 @@ def test_empty_prod(self):
 
         expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A')
 
-        # NA by default
+        # 1 by default
         result = df.groupby("A").B.prod()
-        expected = pd.Series([2, 1, np.nan], expected_idx, name='B')
+        expected = pd.Series([2, 1, 1], expected_idx, name='B')
         tm.assert_series_equal(result, expected)
 
         # min_count=0
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index cf4a6ec1c932ac..a13d985ab6974a 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2704,7 +2704,7 @@ def h(df, arg3):
 
         # Assert the results here
         index = pd.Index(['A', 'B', 'C'], name='group')
-        expected = pd.Series([-79.5160891089, -78.4839108911, None],
+        expected = pd.Series([-79.5160891089, -78.4839108911, -80],
                              index=index)
 
         assert_series_equal(expected, result)
diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py
index c8503b16a0e16a..0582d336c87dd9 100644
--- a/pandas/tests/groupby/test_timegrouper.py
+++ b/pandas/tests/groupby/test_timegrouper.py
@@ -273,6 +273,19 @@ def test_timegrouper_with_reg_groups(self):
                 'whole_cost'].sum()
             assert_series_equal(result2, expected)
 
+    def test_count_with_grouper_and_resample(self):
+        # Counting via a time Grouper plus a column key, and via groupby
+        # followed by resample, should each count every row exactly once.
+        df = pd.DataFrame({
+            'A': [1] * 6 + [2] * 6,
+            'B': pd.to_datetime(['2017'] * 3 +
+                                ['2019'] * 3 +
+                                ['2017'] * 3 +
+                                ['2019'] * 3),
+            'C': [1] * 12
+        })
+        result = df.groupby([pd.Grouper(key='B', freq='AS'), 'A']).C.count()
+        assert result.sum() == 12
+        result = (df.set_index("B").sort_index()
+                    .groupby("A").resample("AS").C.count())
+        assert result.sum() == 12
+
     def test_timegrouper_get_group(self):
         # GH 6914
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index 1706b7bc7df160..b98949bf95a7e3 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -41,7 +41,7 @@ def test_empty(self, method, unit, use_bottleneck):
             s = Series([])
-            # NA by default
+            # 0 / 1 by default
             result = getattr(s, method)()
-            assert isna(result)
+            assert result == unit
 
-            # Explict
+            # Explicit
             result = getattr(s, method)(min_count=0)
@@ -52,7 +52,7 @@ def test_empty(self, method, unit, use_bottleneck):
 
             # Skipna, default
             result = getattr(s, method)(skipna=True)
-            assert isna(result)
+            assert result == unit
 
             # Skipna, explicit
             result = getattr(s, method)(skipna=True, min_count=0)
@@ -65,7 +65,7 @@ def test_empty(self, method, unit, use_bottleneck):
             s = Series([np.nan])
-            # NA by default
+            # 0 / 1 by default
             result = getattr(s, method)()
-            assert isna(result)
+            assert result == unit
 
             # Explicit
             result = getattr(s, method)(min_count=0)
@@ -76,7 +76,7 @@ def test_empty(self, method, unit, use_bottleneck):
 
             # Skipna, default
             result = getattr(s, method)(skipna=True)
-            assert isna(result)
+            assert result == unit
 
             # skipna, explicit
             result = getattr(s, method)(skipna=True, min_count=0)
@@ -110,7 +110,7 @@ def test_empty(self, method, unit, use_bottleneck):
 
             # GH #844 (changed in 9422)
             df = DataFrame(np.empty((10, 0)))
-            assert (df.sum(1).isnull()).all()
+            assert (getattr(df, method)(1) == unit).all()
 
             s = pd.Series([1])
             result = getattr(s, method)(min_count=2)
@@ -131,9 +131,9 @@ def test_empty(self, method, unit, use_bottleneck):
     def test_empty_multi(self, method, unit):
         s = pd.Series([1, np.nan, np.nan, np.nan],
                       index=pd.MultiIndex.from_product([('a', 'b'), (0, 1)]))
-        # NaN by default
+        # 0 / 1 by default
         result = getattr(s, method)(level=0)
-        expected = pd.Series([1, np.nan], index=['a', 'b'])
+        expected = pd.Series([1, unit], index=['a', 'b'])
         tm.assert_series_equal(result, expected)
 
         # min_count=0
@@ -147,7 +147,7 @@ def test_empty_multi(self, method, unit):
         tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize(
-        "method", ['sum', 'mean', 'median', 'std', 'var'])
+        "method", ['mean', 'median', 'std', 'var'])
     def test_ops_consistency_on_empty(self, method):
 
         # GH 7869
@@ -195,7 +195,7 @@ def test_sum_overflow(self, use_bottleneck):
             assert np.allclose(float(result), v[-1])
 
     def test_sum(self):
-        self._check_stat_op('sum', np.sum, check_allna=True)
+        self._check_stat_op('sum', np.sum, check_allna=False)
 
     def test_sum_inf(self):
         s = Series(np.random.randn(10))
diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py
index 14a44c36c6a0c4..3c93ff1d3f31eb 100644
--- a/pandas/tests/series/test_quantile.py
+++ b/pandas/tests/series/test_quantile.py
@@ -38,7 +38,7 @@ def test_quantile(self):
 
         # GH7661
         result = Series([np.timedelta64('NaT')]).sum()
-        assert result is pd.NaT
+        assert result == pd.Timedelta(0)
 
         msg = 'percentiles should all be in the interval \\[0, 1\\]'
         for invalid in [-1, 2, [0.5, -1], [0.5, 2]]:
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
index d03ecb9f9b5b79..306d063b364fe3 100644
--- a/pandas/tests/test_nanops.py
+++ b/pandas/tests/test_nanops.py
@@ -182,12 +182,17 @@ def _coerce_tds(targ, res):
                                check_dtype=check_dtype)
 
     def check_fun_data(self, testfunc, targfunc, testarval, targarval,
-                       targarnanval, check_dtype=True, **kwargs):
+                       targarnanval, check_dtype=True, empty_targfunc=None,
+                       **kwargs):
         for axis in list(range(targarval.ndim)) + [None]:
             for skipna in [False, True]:
                 targartempval = targarval if skipna else targarnanval
-                try:
+                if skipna and empty_targfunc and pd.isna(targartempval).all():
+                    targ = empty_targfunc(targartempval, axis=axis, **kwargs)
+                else:
                     targ = targfunc(targartempval, axis=axis, **kwargs)
+
+                try:
                     res = testfunc(testarval, axis=axis, skipna=skipna,
                                    **kwargs)
                     self.check_results(targ, res, axis,
@@ -219,10 +224,11 @@ def check_fun_data(self, testfunc, targfunc, testarval, targarval,
                 except ValueError:
                     return
         self.check_fun_data(testfunc, targfunc, testarval2, targarval2,
-                            targarnanval2, check_dtype=check_dtype, **kwargs)
+                            targarnanval2, check_dtype=check_dtype,
+                            empty_targfunc=empty_targfunc, **kwargs)
 
     def check_fun(self, testfunc, targfunc, testar, targar=None,
-                  targarnan=None, **kwargs):
+                  targarnan=None, empty_targfunc=None, **kwargs):
         if targar is None:
             targar = testar
         if targarnan is None:
@@ -232,7 +238,8 @@ def check_fun(self, testfunc, targfunc, testar, targar=None,
             targarnanval = getattr(self, targarnan)
         try:
             self.check_fun_data(testfunc, targfunc, testarval, targarval,
-                                targarnanval, **kwargs)
+                                targarnanval, empty_targfunc=empty_targfunc,
+                                **kwargs)
         except BaseException as exc:
             exc.args += ('testar: %s' % testar, 'targar: %s' % targar,
                          'targarnan: %s' % targarnan)
@@ -329,7 +336,8 @@ def test_nanall(self):
     def test_nansum(self):
         self.check_funs(nanops.nansum, np.sum, allow_str=False,
-                        allow_date=False, allow_tdelta=True, check_dtype=False)
+                        allow_date=False, allow_tdelta=True, check_dtype=False,
+                        empty_targfunc=np.nansum)
 
     def test_nanmean(self):
         self.check_funs(nanops.nanmean, np.mean, allow_complex=False,
@@ -463,7 +471,8 @@ def test_nankurt(self):
     def test_nanprod(self):
         self.check_funs(nanops.nanprod, np.prod, allow_str=False,
-                        allow_date=False, allow_tdelta=False)
+                        allow_date=False, allow_tdelta=False,
+                        empty_targfunc=np.nanprod)
 
     def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs):
         res00 = checkfun(self.arr_float_2d, self.arr_float1_2d, **kwargs)
diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py
index 34c1ee56831839..6ea9fdff724aa5 100644
--- a/pandas/tests/test_panel.py
+++ b/pandas/tests/test_panel.py
@@ -83,13 +83,13 @@ def test_count(self):
         self._check_stat_op('count', f, obj=self.panel, has_skipna=False)
 
     def test_sum(self):
-        self._check_stat_op('sum', np.sum)
+        self._check_stat_op('sum', np.sum, skipna_alternative=np.nansum)
 
     def test_mean(self):
         self._check_stat_op('mean', np.mean)
 
     def test_prod(self):
-        self._check_stat_op('prod', np.prod)
+        self._check_stat_op('prod', np.prod, skipna_alternative=np.nanprod)
 
     def test_median(self):
         def wrapper(x):
@@ -140,7 +140,8 @@ def alt(x):
 
         self._check_stat_op('sem', alt)
 
-    def _check_stat_op(self, name, alternative, obj=None, has_skipna=True):
+    def _check_stat_op(self, name, alternative, obj=None, has_skipna=True,
+                       skipna_alternative=None):
         if obj is None:
             obj = self.panel
 
@@ -152,11 +153,15 @@ def _check_stat_op(self, name, alternative, obj=None, has_skipna=True):
 
         if has_skipna:
-            def skipna_wrapper(x):
-                nona = remove_na_arraylike(x)
-                if len(nona) == 0:
-                    return np.nan
-                return alternative(nona)
+            if skipna_alternative:
+                def skipna_wrapper(x):
+                    return skipna_alternative(np.asarray(x))
+            else:
+                def skipna_wrapper(x):
+                    nona = remove_na_arraylike(x)
+                    if len(nona) == 0:
+                        return np.nan
+                    return alternative(nona)
 
         def wrapper(x):
             return alternative(np.asarray(x))
diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py
index e194136ec716df..89876c3215b0d1 100644
--- a/pandas/tests/test_panel4d.py
+++ b/pandas/tests/test_panel4d.py
@@ -38,13 +38,13 @@ def test_count(self):
         self._check_stat_op('count', f, obj=self.panel4d, has_skipna=False)
 
     def test_sum(self):
-        self._check_stat_op('sum', np.sum)
+        self._check_stat_op('sum', np.sum, skipna_alternative=np.nansum)
 
     def test_mean(self):
         self._check_stat_op('mean', np.mean)
 
     def test_prod(self):
-        self._check_stat_op('prod', np.prod)
+        self._check_stat_op('prod', np.prod, skipna_alternative=np.nanprod)
 
     def test_median(self):
         def wrapper(x):
@@ -105,7 +105,8 @@ def alt(x):
 
     #     self._check_stat_op('skew', alt)
 
-    def _check_stat_op(self, name, alternative, obj=None, has_skipna=True):
+    def _check_stat_op(self, name, alternative, obj=None, has_skipna=True,
+                       skipna_alternative=None):
         if obj is None:
             obj = self.panel4d
 
@@ -116,11 +117,16 @@ def _check_stat_op(self, name, alternative, obj=None, has_skipna=True):
         f = getattr(obj, name)
 
         if has_skipna:
-            def skipna_wrapper(x):
-                nona = remove_na_arraylike(x)
-                if len(nona) == 0:
-                    return np.nan
-                return alternative(nona)
+
+            if skipna_alternative:
+                def skipna_wrapper(x):
+                    return skipna_alternative(np.asarray(x))
+            else:
+                def skipna_wrapper(x):
+                    nona = remove_na_arraylike(x)
+                    if len(nona) == 0:
+                        return np.nan
+                    return alternative(nona)
 
             def wrapper(x):
                return alternative(np.asarray(x))
diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py
index c9b3504b427805..4402faa43fabba 100644
--- a/pandas/tests/test_resample.py
+++ b/pandas/tests/test_resample.py
@@ -3375,9 +3375,9 @@ def test_aggregate_normal(self):
-    def test_resample_entirly_nat_window(self, method, unit):
+    def test_resample_entirely_nat_window(self, method, unit):
         s = pd.Series([0] * 2 + [np.nan] * 2,
                       index=pd.date_range('2017', periods=4))
-        # nan by default
+        # 0 / 1 by default
         result = methodcaller(method)(s.resample("2d"))
-        expected = pd.Series([0.0, np.nan],
+        expected = pd.Series([0.0, unit],
                              index=pd.to_datetime(['2017-01-01',
                                                    '2017-01-03']))
         tm.assert_series_equal(result, expected)
@@ -3414,7 +3414,15 @@ def test_aggregate_with_nat(self):
         for func in ['min', 'max', 'sum', 'prod']:
             normal_result = getattr(normal_grouped, func)()
             dt_result = getattr(dt_grouped, func)()
-            pad = DataFrame([[np.nan, np.nan, np.nan, np.nan]], index=[3],
+
+            if func == 'sum':
+                fill_value = 0
+            elif func == 'prod':
+                fill_value = 1
+            else:
+                fill_value = np.nan
+
+            pad = DataFrame([[fill_value] * 4], index=[3],
                             columns=['A', 'B', 'C', 'D'])
             expected = normal_result.append(pad)
             expected = expected.sort_index()
@@ -3467,9 +3475,9 @@ def test_upsample_sum(self, method, unit):
                                 '2017-01-01T00:30:00',
                                 '2017-01-01T01:00:00'])
 
-        # NaN by default
+        # 0 / 1 by default
         result = methodcaller(method)(resampled)
-        expected = pd.Series([1, np.nan, 1], index=index)
+        expected = pd.Series([1, unit, 1], index=index)
         tm.assert_series_equal(result, expected)
 
         # min_count=0
diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py
index bee925823eebe0..995cf033d586a2 100644
--- a/pandas/tests/test_window.py
+++ b/pandas/tests/test_window.py
@@ -439,6 +439,26 @@ def tests_empty_df_rolling(self, roller):
         result = DataFrame(index=pd.DatetimeIndex([])).rolling(roller).sum()
         tm.assert_frame_equal(result, expected)
 
+    def test_missing_minp_zero(self):
+        # minp=0
+        x = pd.Series([np.nan])
+        result = x.rolling(1, min_periods=0).sum()
+        expected = pd.Series([0.0])
+        tm.assert_series_equal(result, expected)
+
+        # minp=1
+        result = x.rolling(1, min_periods=1).sum()
+        expected = pd.Series([np.nan])
+        tm.assert_series_equal(result, expected)
+
+    def test_missing_minp_zero_variable(self):
+        x = pd.Series([np.nan] * 4,
+                      index=pd.DatetimeIndex(['2017-01-01', '2017-01-04',
+                                              '2017-01-06', '2017-01-07']))
+        result = x.rolling(pd.Timedelta("2d"), min_periods=0).sum()
+        expected = pd.Series(0.0, index=x.index)
+        tm.assert_series_equal(result, expected)
+
     def test_multi_index_names(self):
 
         # GH 16789, 16825
@@ -512,6 +532,18 @@ def test_empty_df_expanding(self, expander):
             index=pd.DatetimeIndex([])).expanding(expander).sum()
         tm.assert_frame_equal(result, expected)
 
+    def test_missing_minp_zero(self):
+        # minp=0
+        x = pd.Series([np.nan])
+        result = x.expanding(min_periods=0).sum()
+        expected = pd.Series([0.0])
+        tm.assert_series_equal(result, expected)
+
+        # minp=1
+        result = x.expanding(min_periods=1).sum()
+        expected = pd.Series([np.nan])
+        tm.assert_series_equal(result, expected)
+
 
 class TestEWM(Base):
 
@@ -828,7 +860,7 @@ def test_centered_axis_validation(self):
             .rolling(window=3, center=True, axis=2).mean())
 
     def test_rolling_sum(self):
-        self._check_moment_func(mom.rolling_sum, np.sum, name='sum')
+        self._check_moment_func(mom.rolling_sum, np.nansum, name='sum')
 
     def test_rolling_count(self):
         counter = lambda x: np.isfinite(x).astype(float).sum()
@@ -1358,9 +1390,10 @@ def get_result(arr, window, min_periods=None, center=False):
             assert notna(result[4])
 
             # min_periods=0
-            result0 = get_result(arr, 20, min_periods=0)
-            result1 = get_result(arr, 20, min_periods=1)
-            tm.assert_almost_equal(result0, result1)
+            # This equality no longer holds: with min_periods=0 an
+            # all-NaN window now sums to 0 rather than NaN, while with
+            # min_periods=1 it stays NaN, so the two results differ for
+            # windows consisting entirely of missing values.
         else:
             result = get_result(arr, 50)
             tm.assert_almost_equal(result[-1], static_comp(arr[10:-10]))