Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

To csv infs #2050

Merged
merged 18 commits into from
Dec 2, 2012
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions doc/source/missing_data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,11 @@ to handling missing data. While ``NaN`` is the default missing value marker for
reasons of computational speed and convenience, we need to be able to easily
detect this value with data of different types: floating point, integer,
boolean, and general object. In many cases, however, the Python ``None`` will
arise and we wish to also consider that "missing" or "null". Lastly, for legacy
reasons ``inf`` and ``-inf`` are also considered to be "null" in
computations. Since in NumPy divide-by-zero generates ``inf`` or ``-inf`` and
not ``NaN``, I think you will find this is a worthwhile trade-off (Zen of
Python: "practicality beats purity").
arise and we wish to also consider that "missing" or "null".

Until recently, for legacy reasons ``inf`` and ``-inf`` were also
considered to be "null" in computations. This is no longer the case by
default; use the :func:`~pandas.core.common.use_inf_as_null` function to recover it.

.. _missing.isnull:

Expand All @@ -76,8 +76,9 @@ pandas provides the :func:`~pandas.core.common.isnull` and
isnull(df2['one'])
df2['four'].notnull()

**Summary:** ``NaN``, ``inf``, ``-inf``, and ``None`` (in object arrays) are
all considered missing by the ``isnull`` and ``notnull`` functions.
**Summary:** ``NaN`` and ``None`` (in object arrays) are considered
missing by the ``isnull`` and ``notnull`` functions. ``inf`` and
``-inf`` are no longer considered missing by default.

Calculations with missing data
------------------------------
Expand Down
76 changes: 76 additions & 0 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,58 @@ def isnull(obj):
return _isnull_ndarraylike(obj)
else:
return obj is None
isnull_new = isnull

def isnull_old(obj):
    '''
    Detect missing values the legacy way: None, NaN, INF and -INF all
    count as null.  Drop-in replacement for numpy.isnan / -numpy.isfinite
    that is also usable on object arrays.

    Parameters
    ----------
    obj : ndarray or object value

    Returns
    -------
    boolean ndarray or boolean
    '''
    # Scalars short-circuit straight to the cython checker.
    if lib.isscalar(obj):
        return lib.checknull_old(obj)

    from pandas.core.generic import PandasObject

    if isinstance(obj, np.ndarray):
        return _isnull_ndarraylike_old(obj)
    if isinstance(obj, PandasObject):
        # TODO: optimize for DataFrame, etc.
        return obj.apply(isnull_old)
    if isinstance(obj, list) or hasattr(obj, '__array__'):
        # list or anything exposing the array protocol
        return _isnull_ndarraylike_old(obj)
    return obj is None

def use_inf_as_null(flag):
    '''
    Choose which replacement for numpy.isnan / -numpy.isfinite is used
    as the module-level ``isnull``.

    Parameters
    ----------
    flag : bool
        True means treat None, NaN, INF, -INF as null (old way),
        False means None and NaN are null, but INF, -INF are not null
        (new way).

    Notes
    -----
    This approach to setting global module values is discussed and
    approved here:

    * http://stackoverflow.com/questions/4859217/programmatically-creating-variables-in-python/4859312#4859312
    '''
    # PEP 8: test truthiness directly instead of comparing `== True`.
    if flag:
        globals()['isnull'] = isnull_old
    else:
        globals()['isnull'] = isnull_new


def _isnull_ndarraylike(obj):
from pandas import Series
Expand All @@ -90,6 +142,30 @@ def _isnull_ndarraylike(obj):
result = -np.isfinite(obj)
return result

def _isnull_ndarraylike_old(obj):
    '''
    Array version of the legacy null check: None, NaN, INF, -INF are
    all considered null.

    Parameters
    ----------
    obj : ndarray-like

    Returns
    -------
    boolean ndarray, or Series when ``obj`` is an object-dtype Series
    '''
    from pandas import Series
    values = np.asarray(obj)

    if values.dtype.kind in ('O', 'S', 'U'):
        # Working around NumPy ticket 1542: elementwise null checks on
        # object arrays go through a cython loop.
        shape = values.shape

        if values.dtype.kind in ('S', 'U'):
            # string arrays are never considered null
            result = np.zeros(values.shape, dtype=bool)
        else:
            result = np.empty(shape, dtype=bool)
            vec = lib.isnullobj_old(values.ravel())
            result[:] = vec.reshape(shape)

        if isinstance(obj, Series):
            result = Series(result, index=obj.index, copy=False)
    elif values.dtype == np.dtype('M8[ns]'):
        # this is the NaT pattern
        result = values.view('i8') == lib.iNaT
    else:
        # use ~ rather than unary minus: negating a boolean array with
        # `-` is deprecated/removed in modern NumPy.
        result = ~np.isfinite(obj)
    return result

def notnull(obj):
'''
Replacement for numpy.isfinite / -numpy.isnan which is suitable
Expand Down
7 changes: 5 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1126,6 +1126,8 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
----------
path_or_buf : string or file handle / StringIO
File path
sep : character, default ","
Field delimiter for the output file.
na_rep : string, default ''
Missing data representation
float_format : string, default None
Expand All @@ -1143,12 +1145,13 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
sequence should be given if the DataFrame uses MultiIndex. If
False do not print fields for index names. Use index_label=False
for easier importing in R
nanRep : deprecated, use na_rep
mode : Python write mode, default 'w'
sep : character, default ","
Field delimiter for the output file.
encoding : string, optional
a string representing the encoding to use if the contents are
non-ascii, for python versions prior to 3
quoting : optional constant from csv module
defaults to csv.QUOTE_MINIMAL
"""
if nanRep is not None: # pragma: no cover
import warnings
Expand Down
2 changes: 2 additions & 0 deletions pandas/src/inference.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,8 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
if not seen_float:
if '.' in val:
seen_float = 1
elif 'inf' in val: # special case to handle +/-inf
seen_float = 1
else:
ints[i] = <int64_t> fval

Expand Down
12 changes: 8 additions & 4 deletions pandas/src/sparse.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -987,10 +987,12 @@ cdef inline float64_t __rsub(float64_t a, float64_t b):

cdef inline float64_t __div(float64_t a, float64_t b):
    # Division with explicit zero-denominator handling:
    # x/0 -> +/-inf depending on the sign of x, and 0/0 -> NaN.
    if b != 0:
        return a / b
    if a > 0:
        return INF
    if a < 0:
        return -INF
    return NaN

Expand All @@ -999,10 +1001,12 @@ cdef inline float64_t __rdiv(float64_t a, float64_t b):

cdef inline float64_t __floordiv(float64_t a, float64_t b):
    # Floor division with explicit zero-denominator handling:
    # x//0 -> +/-inf depending on the sign of x, and 0//0 -> NaN.
    if b != 0:
        return a // b
    if a > 0:
        return INF
    if a < 0:
        return -INF
    return NaN

Expand Down
42 changes: 42 additions & 0 deletions pandas/src/tseries.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,18 @@ cdef double INF = <double> np.inf
cdef double NEGINF = -INF

cpdef checknull(object val):
    # Scalar null check under the new default behavior: NaN (float or
    # complex), NaT and None-like values are null; +/-inf is NOT null.
    if util.is_float_object(val) or util.is_complex_object(val):
        return val != val and val != INF and val != NEGINF
    if util.is_datetime64_object(val):
        return get_datetime64_value(val) == NPY_NAT
    if isinstance(val, _NaT):
        return True
    if is_array(val):
        # arrays are containers, never a null scalar
        return False
    return util._checknull(val)

cpdef checknull_old(object val):
if util.is_float_object(val) or util.is_complex_object(val):
return val != val or val == INF or val == NEGINF
elif util.is_datetime64_object(val):
Expand All @@ -189,6 +201,7 @@ cpdef checknull(object val):
else:
return util._checknull(val)


def isscalar(object val):
return np.isscalar(val) or val is None or isinstance(val, _Timestamp)

Expand All @@ -206,6 +219,19 @@ def isnullobj(ndarray[object] arr):
result[i] = util._checknull(arr[i])
return result.view(np.bool_)

@cython.wraparound(False)
@cython.boundscheck(False)
def isnullobj_old(ndarray[object] arr):
    '''
    Elementwise legacy null check (None/NaN/INF/-INF) over a 1-d object
    array; returns a boolean ndarray of the same length.
    '''
    cdef Py_ssize_t i, n
    cdef ndarray[uint8_t] result

    n = len(arr)
    result = np.zeros(n, dtype=np.uint8)
    # range() compiles to the same C loop as the legacy
    # `for i from 0 <= i < n` syntax; the unused `val` local is dropped.
    for i in range(n):
        result[i] = util._checknull_old(arr[i])
    return result.view(np.bool_)


@cython.wraparound(False)
@cython.boundscheck(False)
Expand All @@ -223,6 +249,22 @@ def isnullobj2d(ndarray[object, ndim=2] arr):
result[i, j] = 1
return result.view(np.bool_)

@cython.wraparound(False)
@cython.boundscheck(False)
def isnullobj2d_old(ndarray[object, ndim=2] arr):
    '''
    Elementwise legacy null check (None/NaN/INF/-INF) over a 2-d object
    array; returns a boolean ndarray of the same shape.
    '''
    cdef Py_ssize_t i, j, n, m
    cdef ndarray[uint8_t, ndim=2] result

    n, m = (<object> arr).shape
    result = np.zeros((n, m), dtype=np.uint8)
    for i in range(n):
        for j in range(m):
            if checknull_old(arr[i, j]):
                result[i, j] = 1
    return result.view(np.bool_)

def list_to_object_array(list obj):
'''
Convert list to object ndarray. Seriously can't believe I had to write this
Expand Down
9 changes: 9 additions & 0 deletions pandas/src/util.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,15 @@ cdef inline is_array(object o):


cdef inline bint _checknull(object val):
    # None and NaN (but not +/-inf) count as null.  Build the inf
    # constant with float('inf') instead of importing numpy: this is an
    # inline hot-path helper and a per-call module import is costly.
    cdef double INF = float('inf')
    cdef double NEGINF = -INF
    try:
        return bool(val is None or (val != val and val != INF and val != NEGINF))
    except ValueError:
        # e.g. ambiguous truth value from an array-valued comparison
        return False

cdef inline bint _checknull_old(object val):
try:
return bool(val is None or val != val)
except ValueError:
Expand Down
14 changes: 11 additions & 3 deletions pandas/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import unittest

from pandas import Series, DataFrame, date_range, DatetimeIndex
from pandas.core.common import notnull, isnull
from pandas.core.common import notnull, isnull, use_inf_as_null
import pandas.core.common as com
import pandas.util.testing as tm

Expand All @@ -18,9 +18,17 @@ def test_notnull():
assert notnull(1.)
assert not notnull(None)
assert not notnull(np.NaN)

use_inf_as_null(False)
assert notnull(np.inf)
assert notnull(-np.inf)

use_inf_as_null(True)
assert not notnull(np.inf)
assert not notnull(-np.inf)



float_series = Series(np.random.randn(5))
obj_series = Series(np.random.randn(5), dtype=object)
assert(isinstance(notnull(float_series), Series))
Expand All @@ -30,8 +38,8 @@ def test_isnull():
assert not isnull(1.)
assert isnull(None)
assert isnull(np.NaN)
assert isnull(np.inf)
assert isnull(-np.inf)
assert not isnull(np.inf)
assert not isnull(-np.inf)

float_series = Series(np.random.randn(5))
obj_series = Series(np.random.randn(5), dtype=object)
Expand Down
30 changes: 30 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3382,6 +3382,36 @@ def test_to_csv_from_csv(self):

os.remove(path)

def test_to_csv_from_csv_w_some_infs(self):
    # round-trip a frame with a full-NaN column and a mixed inf/NaN
    # column through to_csv / from_csv
    path = '__tmp__'

    self.frame['G'] = np.nan
    self.frame['H'] = self.frame.index.map(
        lambda x: [np.inf, np.nan][np.random.rand() < .5])

    try:
        self.frame.to_csv(path)
        recons = DataFrame.from_csv(path)

        assert_frame_equal(self.frame, recons)
        assert_frame_equal(np.isinf(self.frame), np.isinf(recons))
    finally:
        # remove the temp file even when an assertion above fails
        os.remove(path)

def test_to_csv_from_csv_w_all_infs(self):
    # round-trip a frame with all-inf and all--inf columns through
    # to_csv / from_csv
    path = '__tmp__'

    self.frame['E'] = np.inf
    self.frame['F'] = -np.inf

    try:
        self.frame.to_csv(path)
        recons = DataFrame.from_csv(path)

        assert_frame_equal(self.frame, recons)
        assert_frame_equal(np.isinf(self.frame), np.isinf(recons))
    finally:
        # remove the temp file even when an assertion above fails
        os.remove(path)

def test_to_csv_multiindex(self):
path = '__tmp__'

Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/test_tseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,16 @@ def test_convert_objects():
result = lib.maybe_convert_objects(arr)
assert(result.dtype == np.object_)

def test_convert_infs():
    # 'inf' / '-inf' strings should parse as float64, and the parsed
    # values should actually be +/-inf (not just have float dtype)
    arr = np.array(['inf', 'inf', 'inf'], dtype='O')
    result = lib.maybe_convert_numeric(arr, set(), False)
    assert(result.dtype == np.float64)
    assert((result == np.inf).all())

    arr = np.array(['-inf', '-inf', '-inf'], dtype='O')
    result = lib.maybe_convert_numeric(arr, set(), False)
    assert(result.dtype == np.float64)
    assert((result == -np.inf).all())


def test_convert_objects_ints():
# test that we can detect many kinds of integers
dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
Expand Down
4 changes: 3 additions & 1 deletion pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,10 @@ def assert_almost_equal(a, b):
return

if isinstance(a, (bool, float, int)):
if np.isinf(a):
assert np.isinf(b), err_msg(a,b)
# case for zero
if abs(a) < 1e-5:
elif abs(a) < 1e-5:
np.testing.assert_almost_equal(
a, b, decimal=5, err_msg=err_msg(a, b), verbose=False)
else:
Expand Down