Skip to content

Commit

Permalink
ENH: numerically stable roll_skew and roll_kurt
Browse files Browse the repository at this point in the history
  • Loading branch information
jaimefrio committed Nov 18, 2015
1 parent d6a7700 commit d68300a
Show file tree
Hide file tree
Showing 3 changed files with 149 additions and 140 deletions.
205 changes: 83 additions & 122 deletions pandas/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1331,144 +1331,105 @@ def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1):

return output

#----------------------------------------------------------------------
# Rolling skewness and kurtosis

#-------------------------------------------------------------------------------
# Rolling skewness
@cython.boundscheck(False)
@cython.wraparound(False)
def roll_skew(ndarray[double_t] input, int win, int minp):
def roll_higher_moment(ndarray[double_t] input, int win, int minp, bint kurt):
"""
Numerically stable implementation of skewness and kurtosis using a
Welford-like method. If `kurt` is True, rolling kurtosis is computed,
if False, rolling skewness.
"""
cdef double val, prev
cdef double x = 0, xx = 0, xxx = 0
cdef Py_ssize_t nobs = 0, i
cdef Py_ssize_t N = len(input)
cdef double mean_x = 0, s2dm_x = 0, s3dm_x = 0, s4dm_x = 0, rep = NaN
cdef double delta, delta_n, tmp
cdef Py_ssize_t i, nobs = 0, nrep = 0, N = len(input)

cdef ndarray[double_t] output = np.empty(N, dtype=float)

# 3 components of the skewness equation
cdef double A, B, C, R

minp = _check_minp(win, minp, N)
with nogil:
for i from 0 <= i < minp - 1:
val = input[i]
minobs = max(minp, 4 if kurt else 3)

# Not NaN
if val == val:
nobs += 1
x += val
xx += val * val
xxx += val * val * val

output[i] = NaN

for i from minp - 1 <= i < N:
val = input[i]

if val == val:
nobs += 1
x += val
xx += val * val
xxx += val * val * val

if i > win - 1:
prev = input[i - win]
if prev == prev:
x -= prev
xx -= prev * prev
xxx -= prev * prev * prev

nobs -= 1
if nobs >= minp:
A = x / nobs
B = xx / nobs - A * A
C = xxx / nobs - A * A * A - 3 * A * B
if B <= 0 or nobs < 3:
output[i] = NaN
for i from 0 <= i < N:
val = input[i]
prev = NaN if i < win else input[i - win]

if prev == prev:
# prev is not NaN, remove an observation...
nobs -= 1
if nobs < nrep:
# ...all non-NaN values were identical, remove a repeat
nrep -= 1
if nobs == nrep:
# We can get here both if all non-NaN were already identical
# or if nobs == 1 after removing the observation
if nrep == 0:
rep = NaN
mean_x = 0
else:
R = sqrt(B)
output[i] = ((sqrt(nobs * (nobs - 1.)) * C) /
((nobs-2) * R * R * R))
mean_x = rep
# This is redundant most of the time
s2dm_x = s3dm_x = s4dm_x = 0
else:
output[i] = NaN

return output

#-------------------------------------------------------------------------------
# Rolling kurtosis
@cython.boundscheck(False)
@cython.wraparound(False)
def roll_kurt(ndarray[double_t] input,
int win, int minp):
cdef double val, prev
cdef double x = 0, xx = 0, xxx = 0, xxxx = 0
cdef Py_ssize_t nobs = 0, i
cdef Py_ssize_t N = len(input)

cdef ndarray[double_t] output = np.empty(N, dtype=float)

# 5 components of the kurtosis equation
cdef double A, B, C, D, R, K

minp = _check_minp(win, minp, N)
with nogil:
for i from 0 <= i < minp - 1:
val = input[i]

# Not NaN
if val == val:
nobs += 1

# seriously don't ask me why this is faster
x += val
xx += val * val
xxx += val * val * val
xxxx += val * val * val * val

output[i] = NaN

for i from minp - 1 <= i < N:
val = input[i]

if val == val:
nobs += 1
x += val
xx += val * val
xxx += val * val * val
xxxx += val * val * val * val

if i > win - 1:
prev = input[i - win]
if prev == prev:
x -= prev
xx -= prev * prev
xxx -= prev * prev * prev
xxxx -= prev * prev * prev * prev

nobs -= 1

if nobs >= minp:
A = x / nobs
R = A * A
B = xx / nobs - R
R = R * A
C = xxx / nobs - R - 3 * A * B
R = R * A
D = xxxx / nobs - R - 6*B*A*A - 4*C*A

if B == 0 or nobs < 4:
output[i] = NaN

else:
K = (nobs * nobs - 1.)*D/(B*B) - 3*((nobs-1.)**2)
K = K / ((nobs - 2.)*(nobs-3.))

output[i] = K
# ...update mean and sums of raised differences from mean
delta = prev - mean_x
delta_n = delta / nobs
tmp = delta * delta_n * (nobs + 1)
if kurt:
s4dm_x -= ((tmp * ((nobs + 3) * nobs + 3) -
6 * s2dm_x) * delta_n - 4 * s3dm_x) * delta_n
s3dm_x -= (tmp * (nobs + 2) - 3 * s2dm_x) * delta_n
s2dm_x -= tmp
mean_x -= delta_n

if val == val:
# val is not NaN, adding an observation...
nobs += 1
if val == rep:
# ...and adding a repeat
nrep += 1
else:
output[i] = NaN
# ...and resetting repeats
nrep = 1
rep = val
if nobs == nrep:
# ...all non-NaN values are identical
mean_x = rep
s2dm_x = s3dm_x = s4dm_x = 0
else:
# ...update mean and sums of raised differences from mean
delta = val - mean_x
delta_n = delta / nobs
tmp = delta * delta_n * (nobs - 1)
if kurt:
s4dm_x += ((tmp * ((nobs - 3) * nobs + 3) +
6 * s2dm_x) * delta_n - 4 * s3dm_x) * delta_n
s3dm_x += (tmp * (nobs - 2) - 3 * s2dm_x) * delta_n
s2dm_x += tmp
mean_x += delta_n

# Sums of even powers must be positive
if s2dm_x < 0 or s4dm_x < 0:
s2dm_x = s3dm_x = s4_dm_x = 0

if nobs < minobs or s2dm_x == 0:
output[i] = NaN
elif kurt:
# multiplications are cheap, divisions are not
tmp = s2dm_x * s2dm_x
output[i] = (nobs - 1) * (nobs * (nobs + 1) * s4dm_x -
3 * (nobs - 1) * tmp)
output[i] /= tmp * (nobs - 2) * (nobs - 3)
else:
# multiplications are cheap, divisions and square roots are not
tmp = (nobs - 2) * (nobs - 2) * s2dm_x * s2dm_x * s2dm_x
output[i] = s3dm_x * nobs * sqrt((nobs - 1) / tmp)

return output


#-------------------------------------------------------------------------------
# Rolling median, min, max

Expand Down
52 changes: 38 additions & 14 deletions pandas/stats/moments.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,8 @@ def rolling_corr_pairwise(df1, df2=None, window=None, min_periods=None,


def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False,
how=None, args=(), kwargs={}, **kwds):
how=None, args=(), kwargs={}, center_data=False,
norm_data=False, **kwds):
"""
Rolling statistical measure using supplied function. Designed to be
used with passed-in Cython array-based functions.
Expand All @@ -378,15 +379,21 @@ def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False,
Passed on to func
kwargs : dict
Passed on to func
center_data : bool
If True, subtract the mean of the data from the values
norm_data: bool
If True, subtract the mean of the data from the values, and divide
by their standard deviation.
Returns
-------
y : type of input
"""
arg = _conv_timerule(arg, freq, how)

return_hook, values = _process_data_structure(arg)

return_hook, values = _process_data_structure(arg,
center_data=center_data,
norm_data=norm_data)
if values.size == 0:
result = values.copy()
else:
Expand Down Expand Up @@ -423,7 +430,8 @@ def _center_window(rs, window, axis):
return rs


def _process_data_structure(arg, kill_inf=True):
def _process_data_structure(arg, kill_inf=True, center_data=False,
norm_data=False):
if isinstance(arg, DataFrame):
return_hook = lambda v: type(arg)(v, index=arg.index,
columns=arg.columns)
Expand All @@ -438,9 +446,15 @@ def _process_data_structure(arg, kill_inf=True):
if not issubclass(values.dtype.type, float):
values = values.astype(float)

if kill_inf:
if kill_inf or center_data or norm_data:
values = values.copy()
values[np.isinf(values)] = np.NaN
mask = np.isfinite(values)
if kill_inf:
values[~mask] = np.NaN
if center_data or norm_data:
values -= np.mean(values[mask])
if norm_data:
values /= np.std(values[mask])

return return_hook, values

Expand Down Expand Up @@ -629,7 +643,8 @@ def _use_window(minp, window):
return minp


def _rolling_func(func, desc, check_minp=_use_window, how=None, additional_kw=''):
def _rolling_func(func, desc, check_minp=_use_window, how=None,
additional_kw='', center_data=False, norm_data=False):
if how is None:
how_arg_str = 'None'
else:
Expand All @@ -645,7 +660,8 @@ def call_cython(arg, window, minp, args=(), kwargs={}, **kwds):
minp = check_minp(minp, window)
return func(arg, window, minp, **kwds)
return _rolling_moment(arg, window, call_cython, min_periods, freq=freq,
center=center, how=how, **kwargs)
center=center, how=how, center_data=center_data,
norm_data=norm_data, **kwargs)

return f

Expand All @@ -657,16 +673,24 @@ def call_cython(arg, window, minp, args=(), kwargs={}, **kwds):
how='median')

_ts_std = lambda *a, **kw: _zsqrt(algos.roll_var(*a, **kw))
def _roll_skew(*args, **kwargs):
kwargs['kurt'] = False
return algos.roll_higher_moment(*args, **kwargs)
def _roll_kurt(*args, **kwargs):
kwargs['kurt'] = True
return algos.roll_higher_moment(*args, **kwargs)
rolling_std = _rolling_func(_ts_std, 'Moving standard deviation.',
check_minp=_require_min_periods(1),
additional_kw=_ddof_kw)
rolling_var = _rolling_func(algos.roll_var, 'Moving variance.',
check_minp=_require_min_periods(1),
additional_kw=_ddof_kw)
rolling_skew = _rolling_func(algos.roll_skew, 'Unbiased moving skewness.',
check_minp=_require_min_periods(3))
rolling_kurt = _rolling_func(algos.roll_kurt, 'Unbiased moving kurtosis.',
check_minp=_require_min_periods(4))
rolling_skew = _rolling_func(_roll_skew, 'Unbiased moving skewness.',
check_minp=_require_min_periods(3),
center_data=True, norm_data=False)
rolling_kurt = _rolling_func(_roll_kurt, 'Unbiased moving kurtosis.',
check_minp=_require_min_periods(4),
center_data=True, norm_data=True)


def rolling_quantile(arg, window, quantile, min_periods=None, freq=None,
Expand Down Expand Up @@ -903,9 +927,9 @@ def call_cython(arg, window, minp, args=(), kwargs={}, **kwds):
expanding_var = _expanding_func(algos.roll_var, 'Expanding variance.',
check_minp=_require_min_periods(1),
additional_kw=_ddof_kw)
expanding_skew = _expanding_func(algos.roll_skew, 'Unbiased expanding skewness.',
expanding_skew = _expanding_func(_roll_skew, 'Unbiased expanding skewness.',
check_minp=_require_min_periods(3))
expanding_kurt = _expanding_func(algos.roll_kurt, 'Unbiased expanding kurtosis.',
expanding_kurt = _expanding_func(_roll_kurt, 'Unbiased expanding kurtosis.',
check_minp=_require_min_periods(4))


Expand Down
Loading

0 comments on commit d68300a

Please sign in to comment.