Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Use Kahan summation and Welfords method to calculate rolling var and std #37055

Merged
merged 10 commits into from
Oct 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ Other enhancements
- Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`)
- Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
- Added :meth:`Rolling.sem()` and :meth:`Expanding.sem()` to compute the standard error of mean (:issue:`26476`).
- :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welfords Method to avoid numerical issues (:issue:`37051`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is not fully true, but its better so ok.


.. _whatsnew_120.api_breaking.python:

Expand Down
50 changes: 31 additions & 19 deletions pandas/_libs/window/aggregations.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -311,37 +311,46 @@ cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs,


cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x,
float64_t *ssqdm_x) nogil:
float64_t *ssqdm_x, float64_t *compensation) nogil:
""" add a value from the var calc """
cdef:
float64_t delta
float64_t delta, prev_mean, y, t

# `isnan` instead of equality as fix for GH-21813, msvc 2017 bug
if isnan(val):
return

nobs[0] = nobs[0] + 1
# a part of Welford's method for the online variance-calculation
# Welford's method for the online variance-calculation
# using Kahan summation
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
delta = val - mean_x[0]
prev_mean = mean_x[0] - compensation[0]
y = val - compensation[0]
t = y - mean_x[0]
compensation[0] = t + mean_x[0] - y
delta = t
mean_x[0] = mean_x[0] + delta / nobs[0]
ssqdm_x[0] = ssqdm_x[0] + ((nobs[0] - 1) * delta ** 2) / nobs[0]
ssqdm_x[0] = ssqdm_x[0] + (val - prev_mean) * (val - mean_x[0])


cdef inline void remove_var(float64_t val, float64_t *nobs, float64_t *mean_x,
float64_t *ssqdm_x) nogil:
float64_t *ssqdm_x, float64_t *compensation) nogil:
""" remove a value from the var calc """
cdef:
float64_t delta

float64_t delta, prev_mean, y, t
if notnan(val):
nobs[0] = nobs[0] - 1
if nobs[0]:
# a part of Welford's method for the online variance-calculation
# Welford's method for the online variance-calculation
# using Kahan summation
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
delta = val - mean_x[0]
prev_mean = mean_x[0] - compensation[0]
y = val - compensation[0]
t = y - mean_x[0]
compensation[0] = t + mean_x[0] - y
delta = t
mean_x[0] = mean_x[0] - delta / nobs[0]
ssqdm_x[0] = ssqdm_x[0] - ((nobs[0] + 1) * delta ** 2) / nobs[0]
ssqdm_x[0] = ssqdm_x[0] - (val - prev_mean) * (val - mean_x[0])
else:
mean_x[0] = 0
ssqdm_x[0] = 0
Expand All @@ -353,7 +362,8 @@ def roll_var(ndarray[float64_t] values, ndarray[int64_t] start,
Numerically stable implementation using Welford's method.
"""
cdef:
float64_t mean_x = 0, ssqdm_x = 0, nobs = 0,
float64_t mean_x = 0, ssqdm_x = 0, nobs = 0, compensation_add = 0,
float64_t compensation_remove = 0,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

isn't there a line in this func that you need to remove

eg the < 1e-14

?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See below

float64_t val, prev, delta, mean_x_old
int64_t s, e
Py_ssize_t i, j, N = len(values)
Expand All @@ -375,26 +385,28 @@ def roll_var(ndarray[float64_t] values, ndarray[int64_t] start,
if i == 0 or not is_monotonic_bounds:

for j in range(s, e):
add_var(values[j], &nobs, &mean_x, &ssqdm_x)
add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add)

else:

# After the first window, observations can both be added
# and removed

# calculate adds
for j in range(end[i - 1], e):
add_var(values[j], &nobs, &mean_x, &ssqdm_x)

# calculate deletes
for j in range(start[i - 1], s):
remove_var(values[j], &nobs, &mean_x, &ssqdm_x)
remove_var(values[j], &nobs, &mean_x, &ssqdm_x,
&compensation_remove)

# calculate adds
for j in range(end[i - 1], e):
add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add)

output[i] = calc_var(minp, ddof, nobs, ssqdm_x)

if not is_monotonic_bounds:
for j in range(s, e):
remove_var(values[j], &nobs, &mean_x, &ssqdm_x)
remove_var(values[j], &nobs, &mean_x, &ssqdm_x,
&compensation_remove)

return output

Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/window/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -879,3 +879,20 @@ def test_rolling_sem(constructor):
result = pd.Series(result[0].values)
expected = pd.Series([np.nan] + [0.707107] * 2)
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
("func", "third_value", "values"),
[
("var", 1, [5e33, 0, 0.5, 0.5, 2, 0]),
("std", 1, [7.071068e16, 0, 0.7071068, 0.7071068, 1.414214, 0]),
("var", 2, [5e33, 0.5, 0, 0.5, 2, 0]),
("std", 2, [7.071068e16, 0.7071068, 0, 0.7071068, 1.414214, 0]),
],
)
def test_rolling_var_numerical_issues(func, third_value, values):
# GH: 37051
ds = pd.Series([99999999999999999, 1, third_value, 2, 3, 1, 1])
result = getattr(ds.rolling(2), func)()
expected = pd.Series([np.nan] + values)
tm.assert_series_equal(result, expected)