From 7b79b6611018184324eb962d6b15ba138dee9507 Mon Sep 17 00:00:00 2001
From: Abraham Flaxman
Date: Tue, 9 Oct 2012 10:47:07 -0700
Subject: [PATCH] ENH: add method use_inf_as_null to core.common (GH #1919)

---
Review revision of the submitted patch:
 * util._checknull_old now also reports INF / -INF, so isnullobj_old
   flags them in object arrays (it was a copy of util._checknull).
 * use_inf_as_null tests the truthiness of its flag; the isnull_old
   docstring names the real parameter (obj).
 * test_notnull switches back to use_inf_as_null(False) when it is done.
Blob ids on the "index" lines are those of the original submission.

 pandas/core/common.py       | 76 +++++++++++++++++++++++++++++++++++++
 pandas/src/tseries.pyx      | 41 ++++++++++++++++++++
 pandas/src/util.pxd         | 10 +++++
 pandas/tests/test_common.py | 10 ++++-
 4 files changed, 136 insertions(+), 1 deletion(-)

diff --git a/pandas/core/common.py b/pandas/core/common.py
index 668017c29c6ab..bfd8c6348d59a 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -65,6 +65,58 @@ def isnull(obj):
         return _isnull_ndarraylike(obj)
     else:
         return obj is None
+isnull_new = isnull
+
+def isnull_old(obj):
+    '''
+    Replacement for numpy.isnan / -numpy.isfinite which is suitable
+    for use on object arrays. Treat None, NaN, INF, -INF as null.
+
+    Parameters
+    ----------
+    obj: ndarray or object value
+
+    Returns
+    -------
+    boolean ndarray or boolean
+    '''
+    if lib.isscalar(obj):
+        return lib.checknull_old(obj)
+
+    from pandas.core.generic import PandasObject
+    if isinstance(obj, np.ndarray):
+        return _isnull_ndarraylike_old(obj)
+    elif isinstance(obj, PandasObject):
+        # TODO: optimize for DataFrame, etc.
+        return obj.apply(isnull_old)
+    elif isinstance(obj, list) or hasattr(obj, '__array__'):
+        return _isnull_ndarraylike_old(obj)
+    else:
+        return obj is None
+
+def use_inf_as_null(flag):
+    '''
+    Choose which replacement for numpy.isnan / -numpy.isfinite is used.
+
+    Parameters
+    ----------
+    flag: bool
+        True means treat None, NaN, INF, -INF as null (old way),
+        False means None and NaN are null, but INF, -INF are not null
+        (new way).
+
+    Notes
+    -----
+    This approach to setting global module values is discussed and
+    approved here:
+
+    * http://stackoverflow.com/questions/4859217/programmatically-creating-variables-in-python/4859312#4859312
+    '''
+    if flag:
+        globals()['isnull'] = isnull_old
+    else:
+        globals()['isnull'] = isnull_new
+
 
 def _isnull_ndarraylike(obj):
     from pandas import Series
@@ -90,6 +142,30 @@ def _isnull_ndarraylike(obj):
         result = -np.isfinite(obj)
     return result
 
+def _isnull_ndarraylike_old(obj):
+    from pandas import Series
+    values = np.asarray(obj)
+
+    if values.dtype.kind in ('O', 'S', 'U'):
+        # Working around NumPy ticket 1542
+        shape = values.shape
+
+        if values.dtype.kind in ('S', 'U'):
+            result = np.zeros(values.shape, dtype=bool)
+        else:
+            result = np.empty(shape, dtype=bool)
+            vec = lib.isnullobj_old(values.ravel())
+            result[:] = vec.reshape(shape)
+
+        if isinstance(obj, Series):
+            result = Series(result, index=obj.index, copy=False)
+    elif values.dtype == np.dtype('M8[ns]'):
+        # this is the NaT pattern
+        result = values.view('i8') == lib.iNaT
+    else:
+        result = -np.isfinite(obj)
+    return result
+
 def notnull(obj):
     '''
     Replacement for numpy.isfinite / -numpy.isnan which is suitable
diff --git a/pandas/src/tseries.pyx b/pandas/src/tseries.pyx
index 237b0220aa34d..65250c27bfd57 100644
--- a/pandas/src/tseries.pyx
+++ b/pandas/src/tseries.pyx
@@ -189,6 +189,18 @@ cpdef checknull(object val):
     else:
         return util._checknull(val)
 
+cpdef checknull_old(object val):
+    if util.is_float_object(val) or util.is_complex_object(val):
+        return val != val or val == INF or val == NEGINF
+    elif util.is_datetime64_object(val):
+        return get_datetime64_value(val) == NPY_NAT
+    elif isinstance(val, _NaT):
+        return True
+    elif is_array(val):
+        return False
+    else:
+        return util._checknull(val)
+
 def isscalar(object val):
     return np.isscalar(val) or val is None or isinstance(val, _Timestamp)
 
@@ -207,6 +219,19 @@ def isnullobj(ndarray[object] arr):
         result[i] = util._checknull(arr[i])
     return result.view(np.bool_)
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def isnullobj_old(ndarray[object] arr):
+    cdef Py_ssize_t i, n
+    cdef object val
+    cdef ndarray[uint8_t] result
+
+    n = len(arr)
+    result = np.zeros(n, dtype=np.uint8)
+    for i from 0 <= i < n:
+        result[i] = util._checknull_old(arr[i])
+    return result.view(np.bool_)
+
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
@@ -224,6 +249,22 @@ def isnullobj2d(ndarray[object, ndim=2] arr):
                 result[i, j] = 1
     return result.view(np.bool_)
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def isnullobj2d_old(ndarray[object, ndim=2] arr):
+    cdef Py_ssize_t i, j, n, m
+    cdef object val
+    cdef ndarray[uint8_t, ndim=2] result
+
+    n, m = (<object> arr).shape
+    result = np.zeros((n, m), dtype=np.uint8)
+    for i from 0 <= i < n:
+        for j from 0 <= j < m:
+            val = arr[i, j]
+            if checknull_old(val):
+                result[i, j] = 1
+    return result.view(np.bool_)
+
 def list_to_object_array(list obj):
     '''
     Convert list to object ndarray. Seriously can't believe I had to write this
diff --git a/pandas/src/util.pxd b/pandas/src/util.pxd
index 62bf9dcaa8250..5d789e73973cc 100644
--- a/pandas/src/util.pxd
+++ b/pandas/src/util.pxd
@@ -68,5 +68,15 @@ cdef inline bint _checknull(object val):
     except ValueError:
         return False
 
+cdef inline bint _checknull_old(object val):
+    # "old" semantics: INF and -INF count as null, like None and NaN
+    import numpy as np
+    cdef double INF = <double> np.inf
+    cdef double NEGINF = -INF
+    try:
+        return bool(val is None or val != val or val == INF or val == NEGINF)
+    except ValueError:
+        return False
+
 cdef inline bint _checknan(object val):
     return not cnp.PyArray_Check(val) and val != val
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
index 865a9de941c2f..753c6a721cd94 100644
--- a/pandas/tests/test_common.py
+++ b/pandas/tests/test_common.py
@@ -6,7 +6,7 @@ import unittest
 
 
 from pandas import Series, DataFrame, date_range, DatetimeIndex
-from pandas.core.common import notnull, isnull
+from pandas.core.common import notnull, isnull, use_inf_as_null
 import pandas.core.common as com
 import pandas.util.testing as tm
 
@@ -18,9 +18,17 @@ def test_notnull():
     assert notnull(1.)
     assert not notnull(None)
     assert not notnull(np.NaN)
+
+    use_inf_as_null(False)
     assert notnull(np.inf)
     assert notnull(-np.inf)
 
+    use_inf_as_null(True)
+    assert not notnull(np.inf)
+    assert not notnull(-np.inf)
+    # restore the default so later tests are unaffected
+    use_inf_as_null(False)
+
     float_series = Series(np.random.randn(5))
     obj_series = Series(np.random.randn(5), dtype=object)
     assert(isinstance(notnull(float_series), Series))