diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 9c8968f7f8223..8190c80d774bc 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -299,6 +299,24 @@ cast to ``dtype=object`` (:issue:`38709`) ser ser2 + +.. _whatsnew_130.notable_bug_fixes.rolling_var_precision: + +Removed artificial truncation in rolling variance and standard deviation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`core.window.Rolling.std` and :meth:`core.window.Rolling.var` will no longer +artificially truncate results that are less than ``~1e-8`` and ``~1e-15`` respectively to +zero (:issue:`37051`, :issue:`40448`, :issue:`39872`). + +However, floating point artifacts may now exist in the results when rolling over larger values. + +.. ipython:: python + + s = pd.Series([7, 5, 5, 5]) + s.rolling(3).var() + + .. _whatsnew_130.api_breaking.deps: Increased minimum versions for dependencies diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index efacfad40ef82..46041b6a37a17 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -283,10 +283,6 @@ cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs, result = 0 else: result = ssqdm_x / (nobs - ddof) - # Fix for numerical imprecision. - # Can be result < 0 once Kahan Summation is implemented - if result < 1e-14: - result = 0 else: result = NaN diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 6db86b940737e..0fa49dccda573 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1882,7 +1882,10 @@ def median( The default ``ddof`` of 1 used in :meth:`Series.std` is different than the default ``ddof`` of 0 in :func:`numpy.std`. - A minimum of one period is required for the rolling calculation.\n + A minimum of one period is required for the rolling calculation. + + The implementation is susceptible to floating point imprecision as + shown in the example below.\n """ ).replace("\n", "", 1), create_section_header("Examples"), @@ -1890,13 +1893,13 @@ def median( """ >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) >>> s.rolling(3).std() - 0 NaN - 1 NaN - 2 0.577350 - 3 1.000000 - 4 1.000000 - 5 1.154701 - 6 0.000000 + 0 NaN + 1 NaN + 2 5.773503e-01 + 3 1.000000e+00 + 4 1.000000e+00 + 5 1.154701e+00 + 6 2.580957e-08 dtype: float64 """ ).replace("\n", "", 1), @@ -1931,7 +1934,10 @@ def std(self, ddof: int = 1, *args, **kwargs): The default ``ddof`` of 1 used in :meth:`Series.var` is different than the default ``ddof`` of 0 in :func:`numpy.var`. - A minimum of one period is required for the rolling calculation.\n + A minimum of one period is required for the rolling calculation. + + The implementation is susceptible to floating point imprecision as + shown in the example below.\n """ ).replace("\n", "", 1), create_section_header("Examples"), @@ -1939,13 +1945,13 @@ def std(self, ddof: int = 1, *args, **kwargs): """ >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) >>> s.rolling(3).var() - 0 NaN - 1 NaN - 2 0.333333 - 3 1.000000 - 4 1.000000 - 5 1.333333 - 6 0.000000 + 0 NaN + 1 NaN + 2 3.333333e-01 + 3 1.000000e+00 + 4 1.000000e+00 + 5 1.333333e+00 + 6 6.661338e-16 dtype: float64 """ ).replace("\n", "", 1), diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 70c076e086fb7..0af0bba5f5f8c 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1150,3 +1150,25 @@ def test_rolling_descending_date_order_with_offset(window, frame_or_series): idx = date_range(start="2020-01-03", end="2020-01-01", freq="-1d") expected = frame_or_series([np.nan, 3, 2], index=idx) tm.assert_equal(result, expected) + + +def test_rolling_var_floating_artifact_precision(): + # GH 37051 + s = Series([7, 5, 5, 5]) + result = s.rolling(3).var() + expected = Series([np.nan, np.nan, 4 / 3, 0]) + tm.assert_series_equal(result, expected, atol=1.0e-15, rtol=1.0e-15) + + +def test_rolling_std_small_values(): + # GH 37051 + s = Series( + [ + 0.00000054, + 0.00000053, + 0.00000054, + ] + ) + result = s.rolling(2).std() + expected = Series([np.nan, 7.071068e-9, 7.071068e-9]) + tm.assert_series_equal(result, expected, atol=1.0e-15, rtol=1.0e-15)