diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index da7dcc6ab29b9..74534bc371094 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -192,6 +192,7 @@ Other enhancements - Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`) - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`) - Added :meth:`Rolling.sem()` and :meth:`Expanding.sem()` to compute the standard error of mean (:issue:`26476`). +- :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welfords Method to avoid numerical issues (:issue:`37051`) .. _whatsnew_120.api_breaking.python: diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index bba8b4b2432a9..131e0154ce8fc 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -311,37 +311,46 @@ cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs, cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x, - float64_t *ssqdm_x) nogil: + float64_t *ssqdm_x, float64_t *compensation) nogil: """ add a value from the var calc """ cdef: - float64_t delta + float64_t delta, prev_mean, y, t # `isnan` instead of equality as fix for GH-21813, msvc 2017 bug if isnan(val): return nobs[0] = nobs[0] + 1 - # a part of Welford's method for the online variance-calculation + # Welford's method for the online variance-calculation + # using Kahan summation # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - delta = val - mean_x[0] + prev_mean = mean_x[0] - compensation[0] + y = val - compensation[0] + t = y - mean_x[0] + compensation[0] = t + mean_x[0] - y + delta = t mean_x[0] = mean_x[0] + delta / nobs[0] - ssqdm_x[0] = ssqdm_x[0] + ((nobs[0] - 1) * delta ** 2) / nobs[0] + ssqdm_x[0] = ssqdm_x[0] + (val - prev_mean) * (val - mean_x[0]) cdef inline void remove_var(float64_t val, float64_t *nobs, float64_t *mean_x, - float64_t *ssqdm_x) nogil: + float64_t *ssqdm_x, float64_t *compensation) nogil: """ remove a value from the var calc """ cdef: - float64_t delta - + float64_t delta, prev_mean, y, t if notnan(val): nobs[0] = nobs[0] - 1 if nobs[0]: - # a part of Welford's method for the online variance-calculation + # Welford's method for the online variance-calculation + # using Kahan summation # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - delta = val - mean_x[0] + prev_mean = mean_x[0] - compensation[0] + y = val - compensation[0] + t = y - mean_x[0] + compensation[0] = t + mean_x[0] - y + delta = t mean_x[0] = mean_x[0] - delta / nobs[0] - ssqdm_x[0] = ssqdm_x[0] - ((nobs[0] + 1) * delta ** 2) / nobs[0] + ssqdm_x[0] = ssqdm_x[0] - (val - prev_mean) * (val - mean_x[0]) else: mean_x[0] = 0 ssqdm_x[0] = 0 @@ -353,7 +362,8 @@ def roll_var(ndarray[float64_t] values, ndarray[int64_t] start, Numerically stable implementation using Welford's method. """ cdef: - float64_t mean_x = 0, ssqdm_x = 0, nobs = 0, + float64_t mean_x = 0, ssqdm_x = 0, nobs = 0, compensation_add = 0, + float64_t compensation_remove = 0, float64_t val, prev, delta, mean_x_old int64_t s, e Py_ssize_t i, j, N = len(values) @@ -375,26 +385,28 @@ def roll_var(ndarray[float64_t] values, ndarray[int64_t] start, if i == 0 or not is_monotonic_bounds: for j in range(s, e): - add_var(values[j], &nobs, &mean_x, &ssqdm_x) + add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add) else: # After the first window, observations can both be added # and removed - # calculate adds - for j in range(end[i - 1], e): - add_var(values[j], &nobs, &mean_x, &ssqdm_x) - # calculate deletes for j in range(start[i - 1], s): - remove_var(values[j], &nobs, &mean_x, &ssqdm_x) + remove_var(values[j], &nobs, &mean_x, &ssqdm_x, + &compensation_remove) + + # calculate adds + for j in range(end[i - 1], e): + add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add) output[i] = calc_var(minp, ddof, nobs, ssqdm_x) if not is_monotonic_bounds: for j in range(s, e): - remove_var(values[j], &nobs, &mean_x, &ssqdm_x) + remove_var(values[j], &nobs, &mean_x, &ssqdm_x, + &compensation_remove) return output diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index a5fbc3c94786c..3d59f6cdd4996 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -879,3 +879,20 @@ def test_rolling_sem(constructor): result = pd.Series(result[0].values) expected = pd.Series([np.nan] + [0.707107] * 2) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + ("func", "third_value", "values"), + [ + ("var", 1, [5e33, 0, 0.5, 0.5, 2, 0]), + ("std", 1, [7.071068e16, 0, 0.7071068, 0.7071068, 1.414214, 0]), + ("var", 2, [5e33, 0.5, 0, 0.5, 2, 0]), + ("std", 2, [7.071068e16, 0.7071068, 0, 0.7071068, 1.414214, 0]), + ], +) +def test_rolling_var_numerical_issues(func, third_value, values): + # GH: 37051 + ds = pd.Series([99999999999999999, 1, third_value, 2, 3, 1, 1]) + result = getattr(ds.rolling(2), func)() + expected = pd.Series([np.nan] + values) + tm.assert_series_equal(result, expected)