Skip to content

Commit

Permalink
ENH: use bottleneck for implemented nanops if installed, GH #91
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Jan 12, 2012
1 parent f9f198e commit fbb1102
Show file tree
Hide file tree
Showing 8 changed files with 140 additions and 68 deletions.
4 changes: 4 additions & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ pandas 0.7.0
and multiple Series to ``Series.append`` too
- Added ``justify`` argument to ``DataFrame.to_string`` to allow different
alignment of column headers
- Add ``sort`` option to GroupBy to allow disabling sorting of the group keys
for potential speedups (GH #595)
- Can pass MaskedArray to Series constructor (PR #563)
- Add Panel item access via attributes and IPython completion (GH #554)

**API Changes**

Expand Down
7 changes: 6 additions & 1 deletion TODO.rst
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
meDONE
DOCS 0.7.0
----------
- no sort in groupby
- concat with dict

DONE
----
- SparseSeries name integration + tests
- Refactor Series.repr
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3210,7 +3210,7 @@ def _agg_by_level(self, name, axis=0, level=0, skipna=True):
return grouped.aggregate(applyf)

def _reduce(self, op, axis=0, skipna=True, numeric_only=None):
f = lambda x: op(x, axis=axis, skipna=skipna, copy=True)
f = lambda x: op(x, axis=axis, skipna=skipna)
labels = self._get_agg_axis(axis)
if numeric_only is None:
try:
Expand Down
145 changes: 87 additions & 58 deletions pandas/core/nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,25 +11,52 @@
except ImportError: # pragma: no cover
_USE_BOTTLENECK = False

def nansum(values, axis=None, skipna=True, copy=True):
def _bottleneck_switch(bn_name, alt, **kwargs):
    """
    Build a NaN-aware reduction that prefers bottleneck when it is usable.

    Parameters
    ----------
    bn_name : str
        Name of the function to look up on the ``bn`` module.
    alt : callable
        Fallback implementation, invoked as
        ``alt(values, axis=axis, skipna=skipna, **kwargs)``.
    **kwargs
        Keyword defaults forwarded to both implementations (e.g. ``ddof=1``).

    Returns
    -------
    callable
        ``f(values, axis=None, skipna=True, **kwds)``. Per-call ``kwds``
        override the defaults captured here.
    """
    # Review feedback on this commit (lodagro, Jan 12, 2012): "Here bottleneck
    # is used even when it's not installed."  Only touch ``bn`` when the
    # import succeeded; otherwise every reduction simply uses ``alt``.
    bn_func = getattr(bn, bn_name, None) if _USE_BOTTLENECK else None

    def f(values, axis=None, skipna=True, **kwds):
        params = dict(kwargs, **kwds)
        if bn_func is not None and skipna:
            try:
                result = bn_func(values, axis=axis, **params)
                # prefer to treat inf/-inf as NA
                if not _has_infs(result):
                    return result
            except Exception:
                # deliberate best effort: whatever the fast path (or the
                # inf check) cannot handle is recomputed by the fallback
                pass
        # called exactly once, so an error raised by ``alt`` propagates
        # without the fallback being run a second time
        return alt(values, axis=axis, skipna=skipna, **params)

    return f

def _has_infs(result):
    """
    Report whether a reduction result contains +inf or -inf.

    Parameters
    ----------
    result : scalar or ndarray
        Array results must have dtype float64 or float32.

    Returns
    -------
    bool-like
        True if any element is positive or negative infinity.

    Raises
    ------
    TypeError
        If ``result`` is an ndarray of any other dtype.
    """
    if isinstance(result, np.ndarray):
        # NOTE(review): assumes lib.has_infs_f8/f4 are the Cython helpers
        # declared with a one-dimensional typed buffer; flatten so that
        # n-D results are checked instead of raising on the dimension count
        if result.dtype == 'f8':
            return lib.has_infs_f8(result.ravel())
        elif result.dtype == 'f4':
            return lib.has_infs_f4(result.ravel())
        else:  # pragma: no cover
            raise TypeError('Only support float32/64 here')
    else:
        # np.isinf is True for both signs of infinity, so a separate
        # np.isneginf test adds nothing
        return np.isinf(result)

def _nansum(values, axis=None, skipna=True):
mask = isnull(values)

if skipna and not issubclass(values.dtype.type, np.integer):
if copy:
values = values.copy()
values = values.copy()
np.putmask(values, mask, 0)

the_sum = values.sum(axis)
the_sum = _maybe_null_out(the_sum, axis, mask)

return the_sum

def nanmean(values, axis=None, skipna=True, copy=True):
def _nanmean(values, axis=None, skipna=True):
mask = isnull(values)

if skipna and not issubclass(values.dtype.type, np.integer):
if copy:
values = values.copy()
values = values.copy()
np.putmask(values, mask, 0)

the_sum = values.sum(axis)
Expand All @@ -44,7 +71,7 @@ def nanmean(values, axis=None, skipna=True, copy=True):
the_mean = the_sum / count if count > 0 else np.nan
return the_mean

def nanmedian(values, axis=None, skipna=True, copy=True):
def _nanmedian(values, axis=None, skipna=True):
def get_median(x):
mask = notnull(x)
if not skipna and not mask.all():
Expand All @@ -59,7 +86,7 @@ def get_median(x):
else:
return get_median(values)

def nanvar(values, axis=None, skipna=True, copy=True, ddof=1):
def _nanvar(values, axis=None, skipna=True, ddof=1):
mask = isnull(values)

if axis is not None:
Expand All @@ -68,52 +95,17 @@ def nanvar(values, axis=None, skipna=True, copy=True, ddof=1):
count = float(values.size - mask.sum())

if skipna:
if copy:
values = values.copy()
values = values.copy()
np.putmask(values, mask, 0)

X = values.sum(axis)
XX = (values ** 2).sum(axis)
return (XX - X ** 2 / count) / (count - ddof)

def nanskew(values, axis=None, skipna=True, copy=True):
if not isinstance(values.dtype.type, np.floating):
values = values.astype('f8')

mask = isnull(values)
count = _get_counts(mask, axis)

if skipna:
if copy:
values = values.copy()
np.putmask(values, mask, 0)

A = values.sum(axis) / count
B = (values ** 2).sum(axis) / count - A ** 2
C = (values ** 3).sum(axis) / count - A ** 3 - 3 * A * B

# floating point error
B = _zero_out_fperr(B)
C = _zero_out_fperr(C)

result = ((np.sqrt((count ** 2 - count)) * C) /
((count - 2) * np.sqrt(B) ** 3))

if isinstance(result, np.ndarray):
result = np.where(B == 0, 0, result)
result[count < 3] = np.nan
return result
else:
result = 0 if B == 0 else result
if count < 3:
return np.nan
return result

def nanmin(values, axis=None, skipna=True, copy=True):
def _nanmin(values, axis=None, skipna=True):
mask = isnull(values)
if skipna and not issubclass(values.dtype.type, np.integer):
if copy:
values = values.copy()
values = values.copy()
np.putmask(values, mask, np.inf)
# numpy 1.6.1 workaround in Python 3.x
if (values.dtype == np.object_
Expand All @@ -129,11 +121,10 @@ def nanmin(values, axis=None, skipna=True, copy=True):

return _maybe_null_out(result, axis, mask)

def nanmax(values, axis=None, skipna=True, copy=True):
def _nanmax(values, axis=None, skipna=True):
mask = isnull(values)
if skipna and not issubclass(values.dtype.type, np.integer):
if copy:
values = values.copy()
values = values.copy()
np.putmask(values, mask, -np.inf)
# numpy 1.6.1 workaround in Python 3.x
if (values.dtype == np.object_
Expand All @@ -149,15 +140,6 @@ def nanmax(values, axis=None, skipna=True, copy=True):
result = values.max(axis)
return _maybe_null_out(result, axis, mask)

def nanprod(values, axis=None, skipna=True, copy=True):
mask = isnull(values)
if skipna and not issubclass(values.dtype.type, np.integer):
if copy:
values = values.copy()
values[mask] = 1
result = values.prod(axis)
return _maybe_null_out(result, axis, mask)

def nanargmax(values, axis=None, skipna=True):
"""
Returns -1 in the NA case
Expand All @@ -182,6 +164,53 @@ def nanargmin(values, axis=None, skipna=True):
result = _maybe_arg_null_out(result, axis, mask, skipna)
return result

# Public NaN-aware reductions. Each name is bound to the dispatcher returned
# by _bottleneck_switch: the first argument is the attribute name looked up
# on ``bn``, the second is the fallback implementation defined above.
nansum = _bottleneck_switch('nansum', _nansum)
nanmean = _bottleneck_switch('nanmean', _nanmean)
nanmedian = _bottleneck_switch('nanmedian', _nanmedian)
# ddof=1 is forwarded as a keyword default to both the ``bn`` function and
# the _nanvar fallback
nanvar = _bottleneck_switch('nanvar', _nanvar, ddof=1)
nanmin = _bottleneck_switch('nanmin', _nanmin)
nanmax = _bottleneck_switch('nanmax', _nanmax)

def nanskew(values, axis=None, skipna=True):
    """
    Compute the skewness of ``values`` along ``axis``.

    Parameters
    ----------
    values : ndarray
        Non-floating input is converted to float64 first.
    axis : int or None, default None
        Axis to reduce over; passed to the ``sum`` calls and to
        ``_get_counts``.
    skipna : bool, default True
        If True, null entries are replaced by 0 in a copy before summing.

    Returns
    -------
    scalar or ndarray
        NaN where fewer than 3 observations are counted, 0 where the
        second central moment is 0.
    """
    # dtype.type is a class, so the test has to be issubclass; the previous
    # isinstance check was always False and sent every input, float64
    # included, through astype
    if not issubclass(values.dtype.type, np.floating):
        values = values.astype('f8')

    mask = isnull(values)
    count = _get_counts(mask, axis)

    if skipna:
        # copy before zero-filling so the caller's array is left untouched
        values = values.copy()
        np.putmask(values, mask, 0)

    A = values.sum(axis) / count
    B = (values ** 2).sum(axis) / count - A ** 2
    C = (values ** 3).sum(axis) / count - A ** 3 - 3 * A * B

    # floating point error
    B = _zero_out_fperr(B)
    C = _zero_out_fperr(C)

    result = ((np.sqrt((count ** 2 - count)) * C) /
              ((count - 2) * np.sqrt(B) ** 3))

    if isinstance(result, np.ndarray):
        # zero variance means zero skew; too few observations means NaN
        result = np.where(B == 0, 0, result)
        result[count < 3] = np.nan
        return result
    else:
        result = 0 if B == 0 else result
        if count < 3:
            return np.nan
        return result

def nanprod(values, axis=None, skipna=True):
    """
    Multiply the entries of ``values`` along ``axis``.

    With ``skipna`` set and a non-integer dtype, null entries are replaced
    by 1 in a copy before multiplying. The product is then handed to
    ``_maybe_null_out`` together with the axis and the null mask.
    """
    null_mask = isnull(values)

    if skipna and not issubclass(values.dtype.type, np.integer):
        # fill a private copy; the caller's array is not modified
        filled = values.copy()
        filled[null_mask] = 1
    else:
        filled = values

    product = filled.prod(axis)
    return _maybe_null_out(product, axis, null_mask)

def _maybe_arg_null_out(result, axis, mask, skipna):
# helper function for nanargmin/nanargmax
if axis is None:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/panel.py
Original file line number Diff line number Diff line change
Expand Up @@ -934,7 +934,7 @@ def apply(self, func, axis='major'):
def _reduce(self, op, axis=0, skipna=True):
axis_name = self._get_axis_name(axis)
axis_number = self._get_axis_number(axis_name)
f = lambda x: op(x, axis=axis_number, skipna=skipna, copy=True)
f = lambda x: op(x, axis=axis_number, skipna=skipna)

result = f(self.values)

Expand Down
13 changes: 6 additions & 7 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -737,7 +737,7 @@ def nunique(self):
def sum(self, axis=0, dtype=None, out=None, skipna=True, level=None):
if level is not None:
return self._agg_by_level('sum', level=level, skipna=skipna)
return nanops.nansum(self.values, skipna=skipna, copy=True)
return nanops.nansum(self.values, skipna=skipna)

@Substitution(name='mean', shortname='mean', na_action=_doc_exclude_na,
extras=_doc_ndarray_interface)
Expand Down Expand Up @@ -779,15 +779,15 @@ def prod(self, axis=None, dtype=None, out=None, skipna=True, level=None):
def min(self, axis=None, out=None, skipna=True, level=None):
if level is not None:
return self._agg_by_level('min', level=level, skipna=skipna)
return nanops.nanmin(self.values, skipna=skipna, copy=True)
return nanops.nanmin(self.values, skipna=skipna)

@Substitution(name='maximum', shortname='max',
na_action=_doc_exclude_na, extras='')
@Appender(_stat_doc)
def max(self, axis=None, out=None, skipna=True, level=None):
if level is not None:
return self._agg_by_level('max', level=level, skipna=skipna)
return nanops.nanmax(self.values, skipna=skipna, copy=True)
return nanops.nanmax(self.values, skipna=skipna)

@Substitution(name='unbiased standard deviation', shortname='stdev',
na_action=_doc_exclude_na, extras='')
Expand All @@ -796,8 +796,7 @@ def std(self, axis=None, dtype=None, out=None, ddof=1, skipna=True,
level=None):
if level is not None:
return self._agg_by_level('std', level=level, skipna=skipna)
return np.sqrt(nanops.nanvar(self.values, skipna=skipna, copy=True,
ddof=ddof))
return np.sqrt(nanops.nanvar(self.values, skipna=skipna))

@Substitution(name='unbiased variance', shortname='var',
na_action=_doc_exclude_na, extras='')
Expand All @@ -806,7 +805,7 @@ def var(self, axis=None, dtype=None, out=None, ddof=1, skipna=True,
level=None):
if level is not None:
return self._agg_by_level('var', level=level, skipna=skipna)
return nanops.nanvar(self.values, skipna=skipna, copy=True, ddof=ddof)
return nanops.nanvar(self.values, skipna=skipna)

@Substitution(name='unbiased skewness', shortname='skew',
na_action=_doc_exclude_na, extras='')
Expand All @@ -815,7 +814,7 @@ def skew(self, skipna=True, level=None):
if level is not None:
return self._agg_by_level('skew', level=level, skipna=skipna)

return nanops.nanskew(self.values, skipna=skipna, copy=True)
return nanops.nanskew(self.values, skipna=skipna)

def _agg_by_level(self, name, level=0, skipna=True):
grouped = self.groupby(level=level)
Expand Down
28 changes: 28 additions & 0 deletions pandas/src/tseries.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,34 @@ def fast_zip(list ndarrays):

return result

def has_infs_f4(ndarray[float32_t] arr):
    """
    Return True if the float32 array ``arr`` holds +inf or -inf anywhere.
    """
    cdef:
        Py_ssize_t pos, length = len(arr)
        float32_t pos_inf, neg_inf, item
        bint found = False

    pos_inf = np.inf
    neg_inf = -pos_inf

    for pos in range(length):
        item = arr[pos]
        if item == pos_inf or item == neg_inf:
            # stop at the first infinite entry
            found = True
            break
    return found

def has_infs_f8(ndarray[float64_t] arr):
    """
    Return True if the float64 array ``arr`` holds +inf or -inf anywhere.
    """
    cdef:
        Py_ssize_t pos, length = len(arr)
        float64_t pos_inf, neg_inf, item
        bint found = False

    pos_inf = np.inf
    neg_inf = -pos_inf

    for pos in range(length):
        item = arr[pos]
        if item == pos_inf or item == neg_inf:
            # stop at the first infinite entry
            found = True
            break
    return found

# cdef class TypeConverter:
# cdef:
# cpython.PyTypeObject* klass_type
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,6 +634,13 @@ def test_iteritems(self):
def test_sum(self):
self._check_stat_op('sum', np.sum)

def test_sum_inf(self):
    # sum() over a Series with inf entries should match the same Series
    # with those entries set to NaN instead
    with_inf = Series(np.random.randn(10))
    with_nan = with_inf.copy()

    with_inf[5:8] = np.inf
    with_nan[5:8] = np.nan

    total_inf = with_inf.sum()
    total_nan = with_nan.sum()
    assert_almost_equal(total_inf, total_nan)

def test_mean(self):
self._check_stat_op('mean', np.mean)

Expand Down

0 comments on commit fbb1102

Please sign in to comment.