Merge pull request #3731 from cpcloud/raise-on-datetime-ufuncs-3726

API: raise TypeError on most datetime64 reduction ops
pandas-dev · Jun 6, 2013 · 6dbcc83 · 6dbcc83
2 parents 7a219e7 + 1b94cfb
commit 6dbcc83
Show file tree

Hide file tree

Showing 6 changed files with 142 additions and 64 deletions.
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -97,6 +97,12 @@ pandas 0.11.1
     in your calls.
   - Do not allow astypes on ``datetime64[ns]`` except to ``object``, and
     ``timedelta64[ns]`` to ``object/int`` (GH3425_)
+  - The behavior of ``datetime64`` dtypes has changed with respect to certain
+    so-called reduction operations (GH3726_). The following operations now
+    raise a ``TypeError`` when performed on a ``Series`` and return an *empty*
+    ``Series`` when performed on a ``DataFrame`` similar to performing these
+    operations on, for example, a ``DataFrame`` of ``slice`` objects:
+    - sum, prod, mean, std, var, skew, kurt, corr, and cov
   - Do not allow datetimelike/timedeltalike creation except with valid types
     (e.g. cannot pass ``datetime64[ms]``) (GH3423_)
   - Add ``squeeze`` keyword to ``groupby`` to allow reduction from
@@ -294,6 +300,7 @@ pandas 0.11.1
 .. _GH3748: https://github.com/pydata/pandas/issues/3748
 .. _GH3741: https://github.com/pydata/pandas/issues/3741
 .. _GH3750: https://github.com/pydata/pandas/issues/3750
+.. _GH3726: https://github.com/pydata/pandas/issues/3726
 
 pandas 0.11.0
 =============

diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt
@@ -128,6 +128,17 @@ API changes
   - ``DataFrame.to_html`` and ``DataFrame.to_latex`` now accept a path for
     their first argument (GH3702_)
 
+  - Do not allow astypes on ``datetime64[ns]`` except to ``object``, and
+    ``timedelta64[ns]`` to ``object/int`` (GH3425_)
+
+  - The behavior of ``datetime64`` dtypes has changed with respect to certain
+    so-called reduction operations (GH3726_). The following operations now
+    raise a ``TypeError`` when performed on a ``Series`` and return an *empty*
+    ``Series`` when performed on a ``DataFrame`` similar to performing these
+    operations on, for example, a ``DataFrame`` of ``slice`` objects:
+
+    - sum, prod, mean, std, var, skew, kurt, corr, and cov
+
 Enhancements
 ~~~~~~~~~~~~
 
@@ -345,3 +356,5 @@ on GitHub for a complete list.
 .. _GH3696: https://github.com/pydata/pandas/issues/3696
 .. _GH3667: https://github.com/pydata/pandas/issues/3667
 .. _GH3741: https://github.com/pydata/pandas/issues/3741
+.. _GH3726: https://github.com/pydata/pandas/issues/3726
+.. _GH3425: https://github.com/pydata/pandas/issues/3425
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -1,10 +1,11 @@
 import sys
+import itertools
+import functools
 
 import numpy as np
 
 from pandas.core.common import isnull, notnull
 import pandas.core.common as com
-import pandas.core.config as cf
 import pandas.lib as lib
 import pandas.algos as algos
 import pandas.hashtable as _hash
@@ -17,41 +18,70 @@
     _USE_BOTTLENECK = False
 
 
-def _bottleneck_switch(bn_name, alt, zero_value=None, **kwargs):
-    try:
-        bn_func = getattr(bn, bn_name)
-    except (AttributeError, NameError):  # pragma: no cover
-        bn_func = None
+class disallow(object):
+    def __init__(self, *dtypes):
+        super(disallow, self).__init__()
+        self.dtypes = tuple(np.dtype(dtype).type for dtype in dtypes)
+
+    def check(self, obj):
+        return hasattr(obj, 'dtype') and issubclass(obj.dtype.type,
+                                                    self.dtypes)
+
+    def __call__(self, f):
+        @functools.wraps(f)
+        def _f(*args, **kwargs):
+            obj_iter = itertools.chain(args, kwargs.itervalues())
+            if any(self.check(obj) for obj in obj_iter):
+                raise TypeError('reduction operation {0!r} not allowed for '
+                                'this dtype'.format(f.__name__.replace('nan',
+                                                                       '')))
+            return f(*args, **kwargs)
+        return _f
+
+
+class bottleneck_switch(object):
+    def __init__(self, zero_value=None, **kwargs):
+        self.zero_value = zero_value
+        self.kwargs = kwargs
+
+    def __call__(self, alt):
+        bn_name = alt.__name__
 
-    def f(values, axis=None, skipna=True, **kwds):
-        if len(kwargs) > 0:
-            for k, v in kwargs.iteritems():
-                if k not in kwds:
-                    kwds[k] = v
         try:
-            if zero_value is not None and values.size == 0:
-                if values.ndim == 1:
-                    return 0
+            bn_func = getattr(bn, bn_name)
+        except (AttributeError, NameError):  # pragma: no cover
+            bn_func = None
+
+        @functools.wraps(alt)
+        def f(values, axis=None, skipna=True, **kwds):
+            if len(self.kwargs) > 0:
+                for k, v in self.kwargs.iteritems():
+                    if k not in kwds:
+                        kwds[k] = v
+            try:
+                if self.zero_value is not None and values.size == 0:
+                    if values.ndim == 1:
+                        return 0
+                    else:
+                        result_shape = (values.shape[:axis] +
+                                        values.shape[axis + 1:])
+                        result = np.empty(result_shape)
+                        result.fill(0)
+                        return result
+
+                if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype):
+                    result = bn_func(values, axis=axis, **kwds)
+                    # prefer to treat inf/-inf as NA
+                    if _has_infs(result):
+                        result = alt(values, axis=axis, skipna=skipna, **kwds)
                 else:
-                    result_shape = values.shape[:
-                                                axis] + values.shape[axis + 1:]
-                    result = np.empty(result_shape)
-                    result.fill(0)
-                    return result
-
-            if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype):
-                result = bn_func(values, axis=axis, **kwds)
-                # prefer to treat inf/-inf as NA
-                if _has_infs(result):
                     result = alt(values, axis=axis, skipna=skipna, **kwds)
-            else:
+            except Exception:
                 result = alt(values, axis=axis, skipna=skipna, **kwds)
-        except Exception:
-            result = alt(values, axis=axis, skipna=skipna, **kwds)
 
-        return result
+            return result
 
-    return f
+        return f
 
 
 def _bn_ok_dtype(dt):
@@ -166,13 +196,17 @@ def nanall(values, axis=None, skipna=True):
     values, mask, dtype = _get_values(values, skipna, True, copy=skipna)
     return values.all(axis)
 
-def _nansum(values, axis=None, skipna=True):
+@disallow('M8')
+@bottleneck_switch(zero_value=0)
+def nansum(values, axis=None, skipna=True):
     values, mask, dtype = _get_values(values, skipna, 0)
     the_sum = values.sum(axis)
     the_sum = _maybe_null_out(the_sum, axis, mask)
     return the_sum
 
-def _nanmean(values, axis=None, skipna=True):
+@disallow('M8')
+@bottleneck_switch()
+def nanmean(values, axis=None, skipna=True):
     values, mask, dtype = _get_values(values, skipna, 0)
     the_sum = _ensure_numeric(values.sum(axis))
     count = _get_counts(mask, axis)
@@ -186,8 +220,9 @@ def _nanmean(values, axis=None, skipna=True):
         the_mean = the_sum / count if count > 0 else np.nan
     return the_mean
 
-
-def _nanmedian(values, axis=None, skipna=True):
+@disallow('M8')
+@bottleneck_switch()
+def nanmedian(values, axis=None, skipna=True):
     def get_median(x):
         mask = notnull(x)
         if not skipna and not mask.all():
@@ -197,13 +232,31 @@ def get_median(x):
     if values.dtype != np.float64:
         values = values.astype('f8')
 
-    if values.ndim > 1:
-        return np.apply_along_axis(get_median, axis, values)
-    else:
-        return get_median(values)
+    notempty = values.size
 
-
-def _nanvar(values, axis=None, skipna=True, ddof=1):
+    # an array from a frame
+    if values.ndim > 1:
+        # there's a non-empty array to apply over otherwise numpy raises
+        if notempty:
+            return np.apply_along_axis(get_median, axis, values)
+
+        # must return the correct shape, but median is not defined for the
+        # empty set so return nans of shape "everything but the passed axis"
+        # since "axis" is where the reduction would occur if we had a nonempty
+        # array
+        shp = np.array(values.shape)
+        dims = np.arange(values.ndim)
+        ret = np.empty(shp[dims != axis])
+        ret.fill(np.nan)
+        return ret
+
+    # otherwise return a scalar value
+    return get_median(values) if notempty else np.nan
+
+
+@disallow('M8')
+@bottleneck_switch(ddof=1)
+def nanvar(values, axis=None, skipna=True, ddof=1):
     if not isinstance(values.dtype.type, np.floating):
         values = values.astype('f8')
 
@@ -223,7 +276,8 @@ def _nanvar(values, axis=None, skipna=True, ddof=1):
     return np.fabs((XX - X ** 2 / count) / (count - ddof))
 
 
-def _nanmin(values, axis=None, skipna=True):
+@bottleneck_switch()
+def nanmin(values, axis=None, skipna=True):
     values, mask, dtype = _get_values(values, skipna, fill_value_typ = '+inf')
 
     # numpy 1.6.1 workaround in Python 3.x
@@ -247,7 +301,8 @@ def _nanmin(values, axis=None, skipna=True):
     return _maybe_null_out(result, axis, mask)
 
 
-def _nanmax(values, axis=None, skipna=True):
+@bottleneck_switch()
+def nanmax(values, axis=None, skipna=True):
     values, mask, dtype = _get_values(values, skipna, fill_value_typ ='-inf')
 
     # numpy 1.6.1 workaround in Python 3.x
@@ -291,14 +346,8 @@ def nanargmin(values, axis=None, skipna=True):
     result = _maybe_arg_null_out(result, axis, mask, skipna)
     return result
 
-nansum = _bottleneck_switch('nansum', _nansum, zero_value=0)
-nanmean = _bottleneck_switch('nanmean', _nanmean)
-nanmedian = _bottleneck_switch('nanmedian', _nanmedian)
-nanvar = _bottleneck_switch('nanvar', _nanvar, ddof=1)
-nanmin = _bottleneck_switch('nanmin', _nanmin)
-nanmax = _bottleneck_switch('nanmax', _nanmax)
-
 
+@disallow('M8')
 def nanskew(values, axis=None, skipna=True):
     if not isinstance(values.dtype.type, np.floating):
         values = values.astype('f8')
@@ -332,6 +381,7 @@ def nanskew(values, axis=None, skipna=True):
         return result
 
 
+@disallow('M8')
 def nankurt(values, axis=None, skipna=True):
     if not isinstance(values.dtype.type, np.floating):
         values = values.astype('f8')
@@ -365,6 +415,7 @@ def nankurt(values, axis=None, skipna=True):
         return result
 
 
+@disallow('M8')
 def nanprod(values, axis=None, skipna=True):
     mask = isnull(values)
     if skipna and not issubclass(values.dtype.type, np.integer):
@@ -423,6 +474,7 @@ def _zero_out_fperr(arg):
         return 0 if np.abs(arg) < 1e-14 else arg
 
 
+@disallow('M8')
 def nancorr(a, b, method='pearson', min_periods=None):
     """
     a, b: ndarrays
@@ -469,6 +521,7 @@ def _spearman(a, b):
     return _cor_methods[method]
 
 
+@disallow('M8')
 def nancov(a, b, min_periods=None):
     if len(a) != len(b):
         raise AssertionError('Operands to nancov must have same size')

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -97,21 +97,15 @@ def convert_to_array(values):
                     values = np.array([values])
                 inferred_type = lib.infer_dtype(values)
                 if inferred_type in set(['datetime64','datetime','date','time']):
-                    if isinstance(values, pa.Array) and com.is_datetime64_dtype(values):
-                        pass
-                    else:
+                    if not (isinstance(values, pa.Array) and com.is_datetime64_dtype(values)):
                         values = tslib.array_to_datetime(values)
                 elif inferred_type in set(['timedelta','timedelta64']):
                     # need to convert timedelta to ns here
                     # safest to convert it to an object arrany to process
-                    if isinstance(values, pa.Array) and com.is_timedelta64_dtype(values):
-                        pass
-                    else:
+                    if not (isinstance(values, pa.Array) and com.is_timedelta64_dtype(values)):
                         values = com._possibly_cast_to_timedelta(values)
                 elif inferred_type in set(['integer']):
-                    if values.dtype == 'timedelta64[ns]':
-                        pass
-                    elif values.dtype.kind == 'm':
+                    if values.dtype.kind == 'm':
                         values = values.astype('timedelta64[ns]')
                 else:
                     values = pa.array(values)
@@ -125,9 +119,9 @@ def convert_to_array(values):
             is_datetime_rhs  = com.is_datetime64_dtype(rvalues)
 
             # 2 datetimes or 2 timedeltas
-            if (is_timedelta_lhs and is_timedelta_rhs) or (is_datetime_lhs and is_datetime_rhs):
-
-                if is_datetime_lhs and name not in ['__sub__']:
+            if (is_timedelta_lhs and is_timedelta_rhs) or (is_datetime_lhs and
+                    is_datetime_rhs):
+                if is_datetime_lhs and name != '__sub__':
                     raise TypeError("can only operate on a datetimes for subtraction, "
                                     "but the operator [%s] was passed" % name)
                 elif is_timedelta_lhs and name not in ['__add__','__sub__']:

diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -9167,6 +9167,15 @@ def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,
 
         f = getattr(frame, name)
 
+        if not ('max' in name or 'min' in name or 'count' in name):
+            df = DataFrame({'b': date_range('1/1/2001', periods=2)})
+            _f = getattr(df, name)
+            print df
+            self.assertFalse(len(_f()))
+
+            df['a'] = range(len(df))
+            self.assert_(len(getattr(df, name)()))
+
         if has_skipna:
             def skipna_wrapper(x):
                 nona = x.dropna().values

diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -1460,10 +1460,6 @@ def test_sum_inf(self):
         with cf.option_context("mode.use_inf_as_null", True):
             assert_almost_equal(s.sum(), s2.sum())
 
-            res = nanops.nansum(arr, axis=1)
-            expected = nanops._nansum(arr, axis=1)
-            assert_almost_equal(res, expected)
-
         res = nanops.nansum(arr, axis=1)
         self.assertTrue(np.isinf(res).all())
 
@@ -1594,6 +1590,12 @@ def testit():
             # add some NaNs
             self.series[5:15] = np.NaN
 
+
+            # idxmax, idxmin, min, and max are valid for dates
+            if not ('max' in name or 'min' in name):
+                ds = Series(date_range('1/1/2001', periods=10))
+                self.assertRaises(TypeError, f, ds)
+
             # skipna or no
             self.assert_(notnull(f(self.series)))
             self.assert_(isnull(f(self.series, skipna=False)))