ENH: numerically stable roll_skew and roll_kurt

pandas-dev · Nov 18, 2015 · d68300a · d68300a
1 parent d6a7700
commit d68300a
Show file tree

Hide file tree

Showing 3 changed files with 149 additions and 140 deletions.
diff --git a/pandas/algos.pyx b/pandas/algos.pyx
@@ -1331,144 +1331,105 @@ def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1):
 
     return output
 
+#----------------------------------------------------------------------
+# Rolling skewness and kurtosis
 
-#-------------------------------------------------------------------------------
-# Rolling skewness
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def roll_skew(ndarray[double_t] input, int win, int minp):
+def roll_higher_moment(ndarray[double_t] input, int win, int minp, bint kurt):
+    """
+    Numerically stable implementation of skewness and kurtosis using a
+    Welford-like method. If `kurt` is True, rolling kurtosis is computed,
+    if False, rolling skewness.
+    """
     cdef double val, prev
-    cdef double x = 0, xx = 0, xxx = 0
-    cdef Py_ssize_t nobs = 0, i
-    cdef Py_ssize_t N = len(input)
+    cdef double mean_x = 0, s2dm_x = 0, s3dm_x = 0, s4dm_x = 0, rep = NaN
+    cdef double delta, delta_n, tmp
+    cdef Py_ssize_t i, nobs = 0, nrep = 0, N = len(input)
 
     cdef ndarray[double_t] output = np.empty(N, dtype=float)
 
-    # 3 components of the skewness equation
-    cdef double A, B, C, R
-
     minp = _check_minp(win, minp, N)
-    with nogil:
-        for i from 0 <= i < minp - 1:
-            val = input[i]
+    minobs = max(minp, 4 if kurt else 3)
 
-            # Not NaN
-            if val == val:
-                nobs += 1
-                x += val
-                xx += val * val
-                xxx += val * val * val
-
-            output[i] = NaN
-
-        for i from minp - 1 <= i < N:
-            val = input[i]
-
-            if val == val:
-                nobs += 1
-                x += val
-                xx += val * val
-                xxx += val * val * val
-
-            if i > win - 1:
-                prev = input[i - win]
-                if prev == prev:
-                    x -= prev
-                    xx -= prev * prev
-                    xxx -= prev * prev * prev
-
-                    nobs -= 1
-            if nobs >= minp:
-                A = x / nobs
-                B = xx / nobs - A * A
-                C = xxx / nobs - A * A * A - 3 * A * B
-                if B <= 0 or nobs < 3:
-                    output[i] = NaN
+    for i from 0 <= i < N:
+        val = input[i]
+        prev = NaN if i < win else input[i - win]
+
+        if prev == prev:
+            # prev is not NaN, remove an observation...
+            nobs -= 1
+            if nobs < nrep:
+                # ...all non-NaN values were identical, remove a repeat
+                nrep -= 1
+            if nobs == nrep:
+                # We can get here both if all non-NaN were already identical
+                # or if nobs == 1 after removing the observation
+                if nrep == 0:
+                    rep = NaN
+                    mean_x = 0
                 else:
-                    R = sqrt(B)
-                    output[i] = ((sqrt(nobs * (nobs - 1.)) * C) /
-                                 ((nobs-2) * R * R * R))
+                    mean_x = rep
+                # This is redundant most of the time
+                s2dm_x = s3dm_x = s4dm_x = 0
             else:
-                output[i] = NaN
-
-    return output
-
-#-------------------------------------------------------------------------------
-# Rolling kurtosis
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def roll_kurt(ndarray[double_t] input,
-               int win, int minp):
-    cdef double val, prev
-    cdef double x = 0, xx = 0, xxx = 0, xxxx = 0
-    cdef Py_ssize_t nobs = 0, i
-    cdef Py_ssize_t N = len(input)
-
-    cdef ndarray[double_t] output = np.empty(N, dtype=float)
-
-    # 5 components of the kurtosis equation
-    cdef double A, B, C, D, R, K
-
-    minp = _check_minp(win, minp, N)
-    with nogil:
-        for i from 0 <= i < minp - 1:
-            val = input[i]
-
-            # Not NaN
-            if val == val:
-                nobs += 1
-
-                # seriously don't ask me why this is faster
-                x += val
-                xx += val * val
-                xxx += val * val * val
-                xxxx += val * val * val * val
-
-            output[i] = NaN
-
-        for i from minp - 1 <= i < N:
-            val = input[i]
-
-            if val == val:
-                nobs += 1
-                x += val
-                xx += val * val
-                xxx += val * val * val
-                xxxx += val * val * val * val
-
-            if i > win - 1:
-                prev = input[i - win]
-                if prev == prev:
-                    x -= prev
-                    xx -= prev * prev
-                    xxx -= prev * prev * prev
-                    xxxx -= prev * prev * prev * prev
-
-                    nobs -= 1
-
-            if nobs >= minp:
-                A = x / nobs
-                R = A * A
-                B = xx / nobs - R
-                R = R * A
-                C = xxx / nobs - R - 3 * A * B
-                R = R * A
-                D = xxxx / nobs - R - 6*B*A*A - 4*C*A
-
-                if B == 0 or nobs < 4:
-                    output[i] = NaN
-
-                else:
-                    K = (nobs * nobs - 1.)*D/(B*B) - 3*((nobs-1.)**2)
-                    K = K / ((nobs - 2.)*(nobs-3.))
-
-                    output[i] = K
+                # ...update mean and sums of raised differences from mean
+                delta = prev - mean_x
+                delta_n = delta / nobs
+                tmp = delta * delta_n * (nobs + 1)
+                if kurt:
+                    s4dm_x -= ((tmp * ((nobs + 3) * nobs + 3) -
+                                6 * s2dm_x) * delta_n - 4 * s3dm_x) * delta_n
+                s3dm_x -= (tmp * (nobs + 2) - 3 * s2dm_x) * delta_n
+                s2dm_x -= tmp
+                mean_x -= delta_n
 
+        if val == val:
+            # val is not NaN, adding an observation...
+            nobs += 1
+            if val == rep:
+                # ...and adding a repeat
+                nrep += 1
             else:
-                output[i] = NaN
+                # ...and resetting repeats
+                nrep = 1
+                rep = val
+            if nobs == nrep:
+                # ...all non-NaN values are identical
+                mean_x = rep
+                s2dm_x = s3dm_x = s4dm_x = 0
+            else:
+                # ...update mean and sums of raised differences from mean
+                delta = val - mean_x
+                delta_n = delta / nobs
+                tmp = delta * delta_n * (nobs - 1)
+                if kurt:
+                    s4dm_x += ((tmp * ((nobs - 3) * nobs + 3) +
+                                6 * s2dm_x) * delta_n - 4 * s3dm_x) * delta_n
+                s3dm_x += (tmp * (nobs - 2) - 3 * s2dm_x) * delta_n
+                s2dm_x += tmp
+                mean_x += delta_n
+
+        # Sums of even powers must be positive
+        if s2dm_x < 0 or s4dm_x < 0:
+            s2dm_x = s3dm_x = s4_dm_x = 0
+
+        if nobs < minobs or s2dm_x == 0:
+            output[i] = NaN
+        elif kurt:
+            # multiplications are cheap, divisions are not
+            tmp = s2dm_x * s2dm_x
+            output[i] = (nobs - 1) * (nobs * (nobs + 1) * s4dm_x -
+                                      3 * (nobs - 1) * tmp)
+            output[i] /= tmp * (nobs - 2) * (nobs - 3)
+        else:
+            # multiplications are cheap, divisions and square roots are not
+            tmp = (nobs - 2) * (nobs - 2) * s2dm_x * s2dm_x * s2dm_x
+            output[i] = s3dm_x * nobs * sqrt((nobs - 1) / tmp)
 
     return output
 
+
 #-------------------------------------------------------------------------------
 # Rolling median, min, max
 

diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py
@@ -355,7 +355,8 @@ def rolling_corr_pairwise(df1, df2=None, window=None, min_periods=None,
 
 
 def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False,
-                    how=None, args=(), kwargs={}, **kwds):
+                    how=None, args=(), kwargs={}, center_data=False,
+                    norm_data=False, **kwds):
     """
     Rolling statistical measure using supplied function. Designed to be
     used with passed-in Cython array-based functions.
@@ -378,15 +379,21 @@ def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False,
         Passed on to func
     kwargs : dict
         Passed on to func
+    center_data : bool
+        If True, subtract the mean of the data from the values
+    norm_data: bool
+        If True, subtract the mean of the data from the values, and divide
+        by their standard deviation.
 
     Returns
     -------
     y : type of input
     """
     arg = _conv_timerule(arg, freq, how)
 
-    return_hook, values = _process_data_structure(arg)
-
+    return_hook, values = _process_data_structure(arg,
+                                                  center_data=center_data,
+                                                  norm_data=norm_data)
     if values.size == 0:
         result = values.copy()
     else:
@@ -423,7 +430,8 @@ def _center_window(rs, window, axis):
     return rs
 
 
-def _process_data_structure(arg, kill_inf=True):
+def _process_data_structure(arg, kill_inf=True, center_data=False,
+                            norm_data=False):
     if isinstance(arg, DataFrame):
         return_hook = lambda v: type(arg)(v, index=arg.index,
                                           columns=arg.columns)
@@ -438,9 +446,15 @@ def _process_data_structure(arg, kill_inf=True):
     if not issubclass(values.dtype.type, float):
         values = values.astype(float)
 
-    if kill_inf:
+    if kill_inf or center_data or norm_data:
         values = values.copy()
-        values[np.isinf(values)] = np.NaN
+        mask = np.isfinite(values)
+        if kill_inf:
+            values[~mask] = np.NaN
+        if center_data or norm_data:
+            values -= np.mean(values[mask])
+        if norm_data:
+            values /= np.std(values[mask])
 
     return return_hook, values
 
@@ -629,7 +643,8 @@ def _use_window(minp, window):
         return minp
 
 
-def _rolling_func(func, desc, check_minp=_use_window, how=None, additional_kw=''):
+def _rolling_func(func, desc, check_minp=_use_window, how=None,
+                  additional_kw='', center_data=False, norm_data=False):
     if how is None:
         how_arg_str = 'None'
     else:
@@ -645,7 +660,8 @@ def call_cython(arg, window, minp, args=(), kwargs={}, **kwds):
             minp = check_minp(minp, window)
             return func(arg, window, minp, **kwds)
         return _rolling_moment(arg, window, call_cython, min_periods, freq=freq,
-                               center=center, how=how, **kwargs)
+                               center=center, how=how, center_data=center_data,
+                               norm_data=norm_data, **kwargs)
 
     return f
 
@@ -657,16 +673,24 @@ def call_cython(arg, window, minp, args=(), kwargs={}, **kwds):
                                how='median')
 
 _ts_std = lambda *a, **kw: _zsqrt(algos.roll_var(*a, **kw))
+def _roll_skew(*args, **kwargs):
+    kwargs['kurt'] = False
+    return algos.roll_higher_moment(*args, **kwargs)
+def _roll_kurt(*args, **kwargs):
+    kwargs['kurt'] = True
+    return algos.roll_higher_moment(*args, **kwargs)
 rolling_std = _rolling_func(_ts_std, 'Moving standard deviation.',
                             check_minp=_require_min_periods(1),
                             additional_kw=_ddof_kw)
 rolling_var = _rolling_func(algos.roll_var, 'Moving variance.',
                             check_minp=_require_min_periods(1),
                             additional_kw=_ddof_kw)
-rolling_skew = _rolling_func(algos.roll_skew, 'Unbiased moving skewness.',
-                             check_minp=_require_min_periods(3))
-rolling_kurt = _rolling_func(algos.roll_kurt, 'Unbiased moving kurtosis.',
-                             check_minp=_require_min_periods(4))
+rolling_skew = _rolling_func(_roll_skew, 'Unbiased moving skewness.',
+                             check_minp=_require_min_periods(3),
+                             center_data=True, norm_data=False)
+rolling_kurt = _rolling_func(_roll_kurt, 'Unbiased moving kurtosis.',
+                             check_minp=_require_min_periods(4),
+                             center_data=True, norm_data=True)
 
 
 def rolling_quantile(arg, window, quantile, min_periods=None, freq=None,
@@ -903,9 +927,9 @@ def call_cython(arg, window, minp, args=(), kwargs={}, **kwds):
 expanding_var = _expanding_func(algos.roll_var, 'Expanding variance.',
                                 check_minp=_require_min_periods(1),
                                 additional_kw=_ddof_kw)
-expanding_skew = _expanding_func(algos.roll_skew, 'Unbiased expanding skewness.',
+expanding_skew = _expanding_func(_roll_skew, 'Unbiased expanding skewness.',
                                  check_minp=_require_min_periods(3))
-expanding_kurt = _expanding_func(algos.roll_kurt, 'Unbiased expanding kurtosis.',
+expanding_kurt = _expanding_func(_roll_kurt, 'Unbiased expanding kurtosis.',
                                  check_minp=_require_min_periods(4))