Skip to content

Commit

Permalink
Merge pull request #3731 from cpcloud/raise-on-datetime-ufuncs-3726
Browse files Browse the repository at this point in the history
API: raise TypeError on most datetime64 reduction ops
  • Loading branch information
jreback committed Jun 6, 2013
2 parents 7a219e7 + 1b94cfb commit 6dbcc83
Show file tree
Hide file tree
Showing 6 changed files with 142 additions and 64 deletions.
7 changes: 7 additions & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,12 @@ pandas 0.11.1
in your calls.
- Do not allow astypes on ``datetime64[ns]`` except to ``object``, and
``timedelta64[ns]`` to ``object/int`` (GH3425_)
- The behavior of ``datetime64`` dtypes has changed with respect to certain
so-called reduction operations (GH3726_). The following operations now
raise a ``TypeError`` when performed on a ``Series`` and return an *empty*
``Series`` when performed on a ``DataFrame`` similar to performing these
operations on, for example, a ``DataFrame`` of ``slice`` objects:
- sum, prod, mean, std, var, skew, kurt, corr, and cov
- Do not allow datetimelike/timedeltalike creation except with valid types
(e.g. cannot pass ``datetime64[ms]``) (GH3423_)
- Add ``squeeze`` keyword to ``groupby`` to allow reduction from
Expand Down Expand Up @@ -294,6 +300,7 @@ pandas 0.11.1
.. _GH3748: https://github.com/pydata/pandas/issues/3748
.. _GH3741: https://github.com/pydata/pandas/issues/3741
.. _GH3750: https://github.com/pydata/pandas/issues/3750
.. _GH3726: https://github.com/pydata/pandas/issues/3726

pandas 0.11.0
=============
Expand Down
13 changes: 13 additions & 0 deletions doc/source/v0.11.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,17 @@ API changes
- ``DataFrame.to_html`` and ``DataFrame.to_latex`` now accept a path for
their first argument (GH3702_)

- Do not allow astypes on ``datetime64[ns]`` except to ``object``, and
``timedelta64[ns]`` to ``object/int`` (GH3425_)

- The behavior of ``datetime64`` dtypes has changed with respect to certain
so-called reduction operations (GH3726_). The following operations now
raise a ``TypeError`` when performed on a ``Series`` and return an *empty*
``Series`` when performed on a ``DataFrame`` similar to performing these
operations on, for example, a ``DataFrame`` of ``slice`` objects:

- sum, prod, mean, std, var, skew, kurt, corr, and cov

Enhancements
~~~~~~~~~~~~

Expand Down Expand Up @@ -345,3 +356,5 @@ on GitHub for a complete list.
.. _GH3696: https://github.com/pydata/pandas/issues/3696
.. _GH3667: https://github.com/pydata/pandas/issues/3667
.. _GH3741: https://github.com/pydata/pandas/issues/3741
.. _GH3726: https://github.com/pydata/pandas/issues/3726
.. _GH3425: https://github.com/pydata/pandas/issues/3425
149 changes: 101 additions & 48 deletions pandas/core/nanops.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import sys
import itertools
import functools

import numpy as np

from pandas.core.common import isnull, notnull
import pandas.core.common as com
import pandas.core.config as cf
import pandas.lib as lib
import pandas.algos as algos
import pandas.hashtable as _hash
Expand All @@ -17,41 +18,70 @@
_USE_BOTTLENECK = False


def _bottleneck_switch(bn_name, alt, zero_value=None, **kwargs):
try:
bn_func = getattr(bn, bn_name)
except (AttributeError, NameError): # pragma: no cover
bn_func = None
class disallow(object):
    """Decorator that rejects calls whose arguments carry a banned dtype.

    Instantiate with one or more dtype specifiers (anything accepted by
    ``np.dtype``, e.g. ``'M8'``) and apply the instance to a function.  The
    wrapped function raises ``TypeError`` whenever any positional or keyword
    argument has a ``dtype`` attribute whose scalar type is a subclass of one
    of the banned scalar types; otherwise the call is forwarded unchanged.
    """

    def __init__(self, *dtypes):
        super(disallow, self).__init__()
        # Store the scalar types (e.g. np.datetime64) so that ``check`` can
        # use ``issubclass`` and match the dtype regardless of its unit.
        self.dtypes = tuple(np.dtype(dtype).type for dtype in dtypes)

    def check(self, obj):
        """Return True if ``obj`` has a dtype of one of the banned types.

        Objects without a ``dtype`` attribute (ints, strings, None, ...) are
        never considered banned.
        """
        return (hasattr(obj, 'dtype') and
                issubclass(obj.dtype.type, self.dtypes))

    def __call__(self, f):
        """Wrap ``f`` so that banned-dtype arguments raise ``TypeError``."""
        @functools.wraps(f)
        def _f(*args, **kwargs):
            # ``dict.values()`` exists on both Python 2 and Python 3, whereas
            # ``itervalues()`` is Python 2 only and would make every call of
            # the wrapped function fail on Python 3.
            obj_iter = itertools.chain(args, kwargs.values())
            if any(self.check(obj) for obj in obj_iter):
                # Report the user-facing operation name: 'nansum' -> 'sum'.
                raise TypeError('reduction operation {0!r} not allowed for '
                                'this dtype'.format(f.__name__.replace('nan',
                                                                       '')))
            return f(*args, **kwargs)
        return _f


class bottleneck_switch(object):
def __init__(self, zero_value=None, **kwargs):
self.zero_value = zero_value
self.kwargs = kwargs

def __call__(self, alt):
bn_name = alt.__name__

def f(values, axis=None, skipna=True, **kwds):
if len(kwargs) > 0:
for k, v in kwargs.iteritems():
if k not in kwds:
kwds[k] = v
try:
if zero_value is not None and values.size == 0:
if values.ndim == 1:
return 0
bn_func = getattr(bn, bn_name)
except (AttributeError, NameError): # pragma: no cover
bn_func = None

@functools.wraps(alt)
def f(values, axis=None, skipna=True, **kwds):
if len(self.kwargs) > 0:
for k, v in self.kwargs.iteritems():
if k not in kwds:
kwds[k] = v
try:
if self.zero_value is not None and values.size == 0:
if values.ndim == 1:
return 0
else:
result_shape = (values.shape[:axis] +
values.shape[axis + 1:])
result = np.empty(result_shape)
result.fill(0)
return result

if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype):
result = bn_func(values, axis=axis, **kwds)
# prefer to treat inf/-inf as NA
if _has_infs(result):
result = alt(values, axis=axis, skipna=skipna, **kwds)
else:
result_shape = values.shape[:
axis] + values.shape[axis + 1:]
result = np.empty(result_shape)
result.fill(0)
return result

if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype):
result = bn_func(values, axis=axis, **kwds)
# prefer to treat inf/-inf as NA
if _has_infs(result):
result = alt(values, axis=axis, skipna=skipna, **kwds)
else:
except Exception:
result = alt(values, axis=axis, skipna=skipna, **kwds)
except Exception:
result = alt(values, axis=axis, skipna=skipna, **kwds)

return result
return result

return f
return f


def _bn_ok_dtype(dt):
Expand Down Expand Up @@ -166,13 +196,17 @@ def nanall(values, axis=None, skipna=True):
values, mask, dtype = _get_values(values, skipna, True, copy=skipna)
return values.all(axis)

def _nansum(values, axis=None, skipna=True):
@disallow('M8')
@bottleneck_switch(zero_value=0)
def nansum(values, axis=None, skipna=True):
values, mask, dtype = _get_values(values, skipna, 0)
the_sum = values.sum(axis)
the_sum = _maybe_null_out(the_sum, axis, mask)
return the_sum

def _nanmean(values, axis=None, skipna=True):
@disallow('M8')
@bottleneck_switch()
def nanmean(values, axis=None, skipna=True):
values, mask, dtype = _get_values(values, skipna, 0)
the_sum = _ensure_numeric(values.sum(axis))
count = _get_counts(mask, axis)
Expand All @@ -186,8 +220,9 @@ def _nanmean(values, axis=None, skipna=True):
the_mean = the_sum / count if count > 0 else np.nan
return the_mean


def _nanmedian(values, axis=None, skipna=True):
@disallow('M8')
@bottleneck_switch()
def nanmedian(values, axis=None, skipna=True):
def get_median(x):
mask = notnull(x)
if not skipna and not mask.all():
Expand All @@ -197,13 +232,31 @@ def get_median(x):
if values.dtype != np.float64:
values = values.astype('f8')

if values.ndim > 1:
return np.apply_along_axis(get_median, axis, values)
else:
return get_median(values)
notempty = values.size


def _nanvar(values, axis=None, skipna=True, ddof=1):
# an array from a frame
if values.ndim > 1:
# there's a non-empty array to apply over otherwise numpy raises
if notempty:
return np.apply_along_axis(get_median, axis, values)

# must return the correct shape, but median is not defined for the
# empty set so return nans of shape "everything but the passed axis"
# since "axis" is where the reduction would occur if we had a nonempty
# array
shp = np.array(values.shape)
dims = np.arange(values.ndim)
ret = np.empty(shp[dims != axis])
ret.fill(np.nan)
return ret

# otherwise return a scalar value
return get_median(values) if notempty else np.nan


@disallow('M8')
@bottleneck_switch(ddof=1)
def nanvar(values, axis=None, skipna=True, ddof=1):
if not isinstance(values.dtype.type, np.floating):
values = values.astype('f8')

Expand All @@ -223,7 +276,8 @@ def _nanvar(values, axis=None, skipna=True, ddof=1):
return np.fabs((XX - X ** 2 / count) / (count - ddof))


def _nanmin(values, axis=None, skipna=True):
@bottleneck_switch()
def nanmin(values, axis=None, skipna=True):
values, mask, dtype = _get_values(values, skipna, fill_value_typ = '+inf')

# numpy 1.6.1 workaround in Python 3.x
Expand All @@ -247,7 +301,8 @@ def _nanmin(values, axis=None, skipna=True):
return _maybe_null_out(result, axis, mask)


def _nanmax(values, axis=None, skipna=True):
@bottleneck_switch()
def nanmax(values, axis=None, skipna=True):
values, mask, dtype = _get_values(values, skipna, fill_value_typ ='-inf')

# numpy 1.6.1 workaround in Python 3.x
Expand Down Expand Up @@ -291,14 +346,8 @@ def nanargmin(values, axis=None, skipna=True):
result = _maybe_arg_null_out(result, axis, mask, skipna)
return result

nansum = _bottleneck_switch('nansum', _nansum, zero_value=0)
nanmean = _bottleneck_switch('nanmean', _nanmean)
nanmedian = _bottleneck_switch('nanmedian', _nanmedian)
nanvar = _bottleneck_switch('nanvar', _nanvar, ddof=1)
nanmin = _bottleneck_switch('nanmin', _nanmin)
nanmax = _bottleneck_switch('nanmax', _nanmax)


@disallow('M8')
def nanskew(values, axis=None, skipna=True):
if not isinstance(values.dtype.type, np.floating):
values = values.astype('f8')
Expand Down Expand Up @@ -332,6 +381,7 @@ def nanskew(values, axis=None, skipna=True):
return result


@disallow('M8')
def nankurt(values, axis=None, skipna=True):
if not isinstance(values.dtype.type, np.floating):
values = values.astype('f8')
Expand Down Expand Up @@ -365,6 +415,7 @@ def nankurt(values, axis=None, skipna=True):
return result


@disallow('M8')
def nanprod(values, axis=None, skipna=True):
mask = isnull(values)
if skipna and not issubclass(values.dtype.type, np.integer):
Expand Down Expand Up @@ -423,6 +474,7 @@ def _zero_out_fperr(arg):
return 0 if np.abs(arg) < 1e-14 else arg


@disallow('M8')
def nancorr(a, b, method='pearson', min_periods=None):
"""
a, b: ndarrays
Expand Down Expand Up @@ -469,6 +521,7 @@ def _spearman(a, b):
return _cor_methods[method]


@disallow('M8')
def nancov(a, b, min_periods=None):
if len(a) != len(b):
raise AssertionError('Operands to nancov must have same size')
Expand Down
18 changes: 6 additions & 12 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,21 +97,15 @@ def convert_to_array(values):
values = np.array([values])
inferred_type = lib.infer_dtype(values)
if inferred_type in set(['datetime64','datetime','date','time']):
if isinstance(values, pa.Array) and com.is_datetime64_dtype(values):
pass
else:
if not (isinstance(values, pa.Array) and com.is_datetime64_dtype(values)):
values = tslib.array_to_datetime(values)
elif inferred_type in set(['timedelta','timedelta64']):
# need to convert timedelta to ns here
# safest to convert it to an object array to process
if isinstance(values, pa.Array) and com.is_timedelta64_dtype(values):
pass
else:
if not (isinstance(values, pa.Array) and com.is_timedelta64_dtype(values)):
values = com._possibly_cast_to_timedelta(values)
elif inferred_type in set(['integer']):
if values.dtype == 'timedelta64[ns]':
pass
elif values.dtype.kind == 'm':
if values.dtype.kind == 'm':
values = values.astype('timedelta64[ns]')
else:
values = pa.array(values)
Expand All @@ -125,9 +119,9 @@ def convert_to_array(values):
is_datetime_rhs = com.is_datetime64_dtype(rvalues)

# 2 datetimes or 2 timedeltas
if (is_timedelta_lhs and is_timedelta_rhs) or (is_datetime_lhs and is_datetime_rhs):

if is_datetime_lhs and name not in ['__sub__']:
if (is_timedelta_lhs and is_timedelta_rhs) or (is_datetime_lhs and
is_datetime_rhs):
if is_datetime_lhs and name != '__sub__':
raise TypeError("can only operate on a datetimes for subtraction, "
"but the operator [%s] was passed" % name)
elif is_timedelta_lhs and name not in ['__add__','__sub__']:
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9167,6 +9167,15 @@ def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,

f = getattr(frame, name)

if not ('max' in name or 'min' in name or 'count' in name):
df = DataFrame({'b': date_range('1/1/2001', periods=2)})
_f = getattr(df, name)
print df
self.assertFalse(len(_f()))

df['a'] = range(len(df))
self.assert_(len(getattr(df, name)()))

if has_skipna:
def skipna_wrapper(x):
nona = x.dropna().values
Expand Down
10 changes: 6 additions & 4 deletions pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1460,10 +1460,6 @@ def test_sum_inf(self):
with cf.option_context("mode.use_inf_as_null", True):
assert_almost_equal(s.sum(), s2.sum())

res = nanops.nansum(arr, axis=1)
expected = nanops._nansum(arr, axis=1)
assert_almost_equal(res, expected)

res = nanops.nansum(arr, axis=1)
self.assertTrue(np.isinf(res).all())

Expand Down Expand Up @@ -1594,6 +1590,12 @@ def testit():
# add some NaNs
self.series[5:15] = np.NaN


# idxmax, idxmin, min, and max are valid for dates
if not ('max' in name or 'min' in name):
ds = Series(date_range('1/1/2001', periods=10))
self.assertRaises(TypeError, f, ds)

# skipna or no
self.assert_(notnull(f(self.series)))
self.assert_(isnull(f(self.series, skipna=False)))
Expand Down

0 comments on commit 6dbcc83

Please sign in to comment.