Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

To csv infs #2050

Merged
merged 18 commits into from
Dec 2, 2012
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions doc/source/missing_data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,11 @@ to handling missing data. While ``NaN`` is the default missing value marker for
reasons of computational speed and convenience, we need to be able to easily
detect this value with data of different types: floating point, integer,
boolean, and general object. In many cases, however, the Python ``None`` will
arise and we wish to also consider that "missing" or "null". Lastly, for legacy
reasons ``inf`` and ``-inf`` are also considered to be "null" in
computations. Since in NumPy divide-by-zero generates ``inf`` or ``-inf`` and
not ``NaN``, I think you will find this is a worthwhile trade-off (Zen of
Python: "practicality beats purity").
arise and we wish to also consider that "missing" or "null".

Until recently, for legacy reasons ``inf`` and ``-inf`` were also
considered to be "null" in computations. This is no longer the case by
default; use the :func:`~pandas.core.common.use_inf_as_null` function to recover it.

.. _missing.isnull:

Expand All @@ -76,8 +76,9 @@ pandas provides the :func:`~pandas.core.common.isnull` and
isnull(df2['one'])
df2['four'].notnull()

**Summary:** ``NaN``, ``inf``, ``-inf``, and ``None`` (in object arrays) are
all considered missing by the ``isnull`` and ``notnull`` functions.
**Summary:** ``NaN`` and ``None`` (in object arrays) are considered
missing by the ``isnull`` and ``notnull`` functions. ``inf`` and
``-inf`` are no longer considered missing by default.

Calculations with missing data
------------------------------
Expand Down
76 changes: 76 additions & 0 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,58 @@ def isnull(obj):
return _isnull_ndarraylike(obj)
else:
return obj is None
isnull_new = isnull

def isnull_old(obj):
    '''
    Detect missing values the legacy way: None, NaN, INF and -INF all
    count as null.  Drop-in replacement for numpy.isnan / -numpy.isfinite
    that is also usable on object arrays.

    Parameters
    ----------
    obj : ndarray or object value

    Returns
    -------
    boolean ndarray or boolean
    '''
    # Scalars short-circuit straight to the cython checker.
    if lib.isscalar(obj):
        return lib.checknull_old(obj)

    from pandas.core.generic import PandasObject

    if isinstance(obj, np.ndarray):
        return _isnull_ndarraylike_old(obj)
    if isinstance(obj, PandasObject):
        # TODO: optimize for DataFrame, etc.
        return obj.apply(isnull_old)
    if isinstance(obj, list) or hasattr(obj, '__array__'):
        # list or anything exposing the array protocol
        return _isnull_ndarraylike_old(obj)
    return obj is None

def use_inf_as_null(flag):
    '''
    Choose which replacement for numpy.isnan / -numpy.isfinite is used
    as the module-level ``isnull``.

    Parameters
    ----------
    flag : bool
        True means treat None, NaN, INF, -INF as null (old way),
        False means None and NaN are null, but INF, -INF are not null
        (new way).

    Notes
    -----
    This approach to setting global module values is discussed and
    approved here:

    * http://stackoverflow.com/questions/4859217/programmatically-creating-variables-in-python/4859312#4859312
    '''
    # PEP 8: test truthiness directly instead of comparing `== True`.
    if flag:
        globals()['isnull'] = isnull_old
    else:
        globals()['isnull'] = isnull_new


def _isnull_ndarraylike(obj):
from pandas import Series
Expand All @@ -90,6 +142,30 @@ def _isnull_ndarraylike(obj):
result = -np.isfinite(obj)
return result

def _isnull_ndarraylike_old(obj):
    '''
    Array version of the legacy null check: None, NaN, INF, -INF are
    all considered null.

    Parameters
    ----------
    obj : ndarray-like

    Returns
    -------
    boolean ndarray, or Series when ``obj`` is an object-dtype Series
    '''
    from pandas import Series
    values = np.asarray(obj)

    if values.dtype.kind in ('O', 'S', 'U'):
        # Working around NumPy ticket 1542: elementwise null checks on
        # object arrays go through a cython loop.
        shape = values.shape

        if values.dtype.kind in ('S', 'U'):
            # string arrays are never considered null
            result = np.zeros(values.shape, dtype=bool)
        else:
            result = np.empty(shape, dtype=bool)
            vec = lib.isnullobj_old(values.ravel())
            result[:] = vec.reshape(shape)

        if isinstance(obj, Series):
            result = Series(result, index=obj.index, copy=False)
    elif values.dtype == np.dtype('M8[ns]'):
        # this is the NaT pattern
        result = values.view('i8') == lib.iNaT
    else:
        # use ~ rather than unary minus: negating a boolean array with
        # `-` is deprecated/removed in modern NumPy.
        result = ~np.isfinite(obj)
    return result

def notnull(obj):
'''
Replacement for numpy.isfinite / -numpy.isnan which is suitable
Expand Down
7 changes: 5 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1126,6 +1126,8 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
----------
path_or_buf : string or file handle / StringIO
File path
sep : character, default ","
Field delimiter for the output file.
na_rep : string, default ''
Missing data representation
float_format : string, default None
Expand All @@ -1143,12 +1145,13 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
sequence should be given if the DataFrame uses MultiIndex. If
False do not print fields for index names. Use index_label=False
for easier importing in R
nanRep : deprecated, use na_rep
mode : Python write mode, default 'w'
sep : character, default ","
Field delimiter for the output file.
encoding : string, optional
a string representing the encoding to use if the contents are
non-ascii, for python versions prior to 3
quoting : optional constant from csv module
defaults to csv.QUOTE_MINIMAL
"""
if nanRep is not None: # pragma: no cover
import warnings
Expand Down
2 changes: 2 additions & 0 deletions pandas/src/inference.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,8 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
if not seen_float:
if '.' in val:
seen_float = 1
elif 'inf' in val: # special case to handle +/-inf
seen_float = 1
else:
ints[i] = <int64_t> fval

Expand Down
12 changes: 8 additions & 4 deletions pandas/src/sparse.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -987,10 +987,12 @@ cdef inline float64_t __rsub(float64_t a, float64_t b):

cdef inline float64_t __div(float64_t a, float64_t b):
    # Division with explicit zero-denominator handling:
    # x/0 -> +/-inf depending on the sign of x, and 0/0 -> NaN.
    if b != 0:
        return a / b
    if a > 0:
        return INF
    if a < 0:
        return -INF
    return NaN

Expand All @@ -999,10 +1001,12 @@ cdef inline float64_t __rdiv(float64_t a, float64_t b):

cdef inline float64_t __floordiv(float64_t a, float64_t b):
    # Floor division with explicit zero-denominator handling:
    # x//0 -> +/-inf depending on the sign of x, and 0//0 -> NaN.
    if b != 0:
        return a // b
    if a > 0:
        return INF
    if a < 0:
        return -INF
    return NaN

Expand Down
42 changes: 42 additions & 0 deletions pandas/src/tseries.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,18 @@ cdef double INF = <double> np.inf
cdef double NEGINF = -INF

cpdef checknull(object val):
    # Scalar null check under the new default behavior: NaN (float or
    # complex), NaT and None-like values are null; +/-inf is NOT null.
    if util.is_float_object(val) or util.is_complex_object(val):
        return val != val and val != INF and val != NEGINF
    if util.is_datetime64_object(val):
        return get_datetime64_value(val) == NPY_NAT
    if isinstance(val, _NaT):
        return True
    if is_array(val):
        # arrays are containers, never a null scalar
        return False
    return util._checknull(val)

cpdef checknull_old(object val):
if util.is_float_object(val) or util.is_complex_object(val):
return val != val or val == INF or val == NEGINF
elif util.is_datetime64_object(val):
Expand All @@ -189,6 +201,7 @@ cpdef checknull(object val):
else:
return util._checknull(val)


def isscalar(object val):
return np.isscalar(val) or val is None or isinstance(val, _Timestamp)

Expand All @@ -206,6 +219,19 @@ def isnullobj(ndarray[object] arr):
result[i] = util._checknull(arr[i])
return result.view(np.bool_)

@cython.wraparound(False)
@cython.boundscheck(False)
def isnullobj_old(ndarray[object] arr):
    '''
    Elementwise legacy null check (None/NaN/INF/-INF) over a 1-d object
    array; returns a boolean ndarray of the same length.
    '''
    cdef Py_ssize_t i, n
    cdef ndarray[uint8_t] result

    n = len(arr)
    result = np.zeros(n, dtype=np.uint8)
    # range() compiles to the same C loop as the legacy
    # `for i from 0 <= i < n` syntax; the unused `val` local is dropped.
    for i in range(n):
        result[i] = util._checknull_old(arr[i])
    return result.view(np.bool_)


@cython.wraparound(False)
@cython.boundscheck(False)
Expand All @@ -223,6 +249,22 @@ def isnullobj2d(ndarray[object, ndim=2] arr):
result[i, j] = 1
return result.view(np.bool_)

@cython.wraparound(False)
@cython.boundscheck(False)
def isnullobj2d_old(ndarray[object, ndim=2] arr):
    '''
    Elementwise legacy null check (None/NaN/INF/-INF) over a 2-d object
    array; returns a boolean ndarray of the same shape.
    '''
    cdef Py_ssize_t i, j, n, m
    cdef ndarray[uint8_t, ndim=2] result

    n, m = (<object> arr).shape
    result = np.zeros((n, m), dtype=np.uint8)
    for i in range(n):
        for j in range(m):
            if checknull_old(arr[i, j]):
                result[i, j] = 1
    return result.view(np.bool_)

def list_to_object_array(list obj):
'''
Convert list to object ndarray. Seriously can't believe I had to write this
Expand Down
9 changes: 9 additions & 0 deletions pandas/src/util.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,15 @@ cdef inline is_array(object o):


cdef inline bint _checknull(object val):
    # None and NaN (but not +/-inf) count as null.  Build the inf
    # constant with float('inf') instead of importing numpy: this is an
    # inline hot-path helper and a per-call module import is costly.
    cdef double INF = float('inf')
    cdef double NEGINF = -INF
    try:
        return bool(val is None or (val != val and val != INF and val != NEGINF))
    except ValueError:
        # e.g. ambiguous truth value from an array-valued comparison
        return False

cdef inline bint _checknull_old(object val):
try:
return bool(val is None or val != val)
except ValueError:
Expand Down
14 changes: 11 additions & 3 deletions pandas/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import unittest

from pandas import Series, DataFrame, date_range, DatetimeIndex
from pandas.core.common import notnull, isnull
from pandas.core.common import notnull, isnull, use_inf_as_null
import pandas.core.common as com
import pandas.util.testing as tm

Expand All @@ -18,9 +18,17 @@ def test_notnull():
assert notnull(1.)
assert not notnull(None)
assert not notnull(np.NaN)

use_inf_as_null(False)
assert notnull(np.inf)
assert notnull(-np.inf)

use_inf_as_null(True)
assert not notnull(np.inf)
assert not notnull(-np.inf)



float_series = Series(np.random.randn(5))
obj_series = Series(np.random.randn(5), dtype=object)
assert(isinstance(notnull(float_series), Series))
Expand All @@ -30,8 +38,8 @@ def test_isnull():
assert not isnull(1.)
assert isnull(None)
assert isnull(np.NaN)
assert isnull(np.inf)
assert isnull(-np.inf)
assert not isnull(np.inf)
assert not isnull(-np.inf)

float_series = Series(np.random.randn(5))
obj_series = Series(np.random.randn(5), dtype=object)
Expand Down
30 changes: 30 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3382,6 +3382,36 @@ def test_to_csv_from_csv(self):

os.remove(path)

def test_to_csv_from_csv_w_some_infs(self):
    # round-trip a frame with a full-NaN column and a mixed inf/NaN
    # column through to_csv / from_csv
    path = '__tmp__'

    self.frame['G'] = np.nan
    self.frame['H'] = self.frame.index.map(
        lambda x: [np.inf, np.nan][np.random.rand() < .5])

    try:
        self.frame.to_csv(path)
        recons = DataFrame.from_csv(path)

        assert_frame_equal(self.frame, recons)
        assert_frame_equal(np.isinf(self.frame), np.isinf(recons))
    finally:
        # remove the temp file even when an assertion above fails
        os.remove(path)

def test_to_csv_from_csv_w_all_infs(self):
    # round-trip a frame with all-inf and all--inf columns through
    # to_csv / from_csv
    path = '__tmp__'

    self.frame['E'] = np.inf
    self.frame['F'] = -np.inf

    try:
        self.frame.to_csv(path)
        recons = DataFrame.from_csv(path)

        assert_frame_equal(self.frame, recons)
        assert_frame_equal(np.isinf(self.frame), np.isinf(recons))
    finally:
        # remove the temp file even when an assertion above fails
        os.remove(path)

def test_to_csv_multiindex(self):
path = '__tmp__'

Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/test_tseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,16 @@ def test_convert_objects():
result = lib.maybe_convert_objects(arr)
assert(result.dtype == np.object_)

def test_convert_infs():
    # 'inf' / '-inf' strings should parse as float64, and the parsed
    # values should actually be +/-inf (not just have float dtype)
    arr = np.array(['inf', 'inf', 'inf'], dtype='O')
    result = lib.maybe_convert_numeric(arr, set(), False)
    assert(result.dtype == np.float64)
    assert((result == np.inf).all())

    arr = np.array(['-inf', '-inf', '-inf'], dtype='O')
    result = lib.maybe_convert_numeric(arr, set(), False)
    assert(result.dtype == np.float64)
    assert((result == -np.inf).all())


def test_convert_objects_ints():
# test that we can detect many kinds of integers
dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
Expand Down
4 changes: 3 additions & 1 deletion pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,10 @@ def assert_almost_equal(a, b):
return

if isinstance(a, (bool, float, int)):
if np.isinf(a):
assert np.isinf(b), err_msg(a,b)
# case for zero
if abs(a) < 1e-5:
elif abs(a) < 1e-5:
np.testing.assert_almost_equal(
a, b, decimal=5, err_msg=err_msg(a, b), verbose=False)
else:
Expand Down