BUG/ENH: idxmin/idxmax NA behavior should be same as other reductions, refactoring, bugfix in Cython object conversion function
wesm committed Nov 25, 2011
1 parent 2bf7613 commit c1307b6
Showing 9 changed files with 108 additions and 90 deletions.
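An illustrative sketch of the behavior change described in the commit title (not part of the diff itself): a row or column that is entirely NA now yields NaN from idxmin/idxmax instead of an arbitrary label, matching the other reductions. The column names below are made up.

    import numpy as np
    from pandas import DataFrame

    df = DataFrame({'x': [1.0, np.nan, 3.0],
                    'y': [np.nan, np.nan, np.nan]})

    # 'x' has a well-defined minimum at row 0; 'y' is all-NA, so its
    # entry in the result is NaN rather than whichever row label
    # argmin happened to land on before.
    df.idxmin()
    df.idxmax()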
44 changes: 25 additions & 19 deletions pandas/core/frame.py
@@ -2186,7 +2186,8 @@ def _shift_indexer(self, periods):
#----------------------------------------------------------------------
# Function application

def apply(self, func, axis=0, broadcast=False, raw=False):
def apply(self, func, axis=0, broadcast=False, raw=False,
args=(), **kwds):
"""
Applies function along input axis of DataFrame. Objects passed to
functions are Series objects having index either the DataFrame's index
@@ -2207,6 +2208,10 @@ def apply(self, func, axis=0, broadcast=False, raw=False):
passed function will receive ndarray objects instead. If you are
just applying a NumPy reduction function this will achieve much
better performance
args : tuple
Positional arguments to pass to function in addition to the
array/series
Additional keyword arguments will be passed as keywords to the function
Examples
--------
@@ -2226,26 +2231,31 @@
if len(self.columns) == 0 and len(self.index) == 0:
return self

if isinstance(func, np.ufunc):
results = func(self.values)
if kwds or args and not isinstance(func, np.ufunc):
f = lambda x: func(x, *args, **kwds)
else:
f = func

if isinstance(f, np.ufunc):
results = f(self.values)
return self._constructor(data=results, index=self.index,
columns=self.columns, copy=False)
else:
if not broadcast:
if not all(self.shape):
is_reduction = not isinstance(func(_EMPTY_SERIES),
is_reduction = not isinstance(f(_EMPTY_SERIES),
np.ndarray)
if is_reduction:
return Series(np.nan, index=self._get_agg_axis(axis))
else:
return self.copy()

if raw and not self._is_mixed_type:
return self._apply_raw(func, axis)
return self._apply_raw(f, axis)
else:
return self._apply_standard(func, axis)
return self._apply_standard(f, axis)
else:
return self._apply_broadcast(func, axis)
return self._apply_broadcast(f, axis)

def _apply_raw(self, func, axis):
try:
@@ -2857,12 +2867,10 @@ def idxmin(self, axis=0, skipna=True):
-------
idxmin : Series
"""
values = self.values.copy()
if skipna and not issubclass(values.dtype.type, np.integer):
np.putmask(values, -np.isfinite(values), np.inf)
argmin_index = self._get_axis(axis)
return Series([argmin_index[i] for i in values.argmin(axis)],
index=self._get_agg_axis(axis))
indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna)
index = self._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return Series(result, index=self._get_agg_axis(axis))

def idxmax(self, axis=0, skipna=True):
"""
Expand All @@ -2881,12 +2889,10 @@ def idxmax(self, axis=0, skipna=True):
-------
idxmax : Series
"""
values = self.values.copy()
if skipna and not issubclass(values.dtype.type, np.integer):
np.putmask(values, -np.isfinite(values), -np.inf)
argmax_index = self._get_axis(axis)
return Series([argmax_index[i] for i in values.argmax(axis)],
index=self._get_agg_axis(axis))
indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna)
index = self._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return Series(result, index=self._get_agg_axis(axis))

def _agg_by_level(self, name, axis=0, level=0, skipna=True):
method = getattr(type(self), name)
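For context, a small usage sketch of the new args/**kwds pass-through in DataFrame.apply (illustrative only; the data and the helper function are made up):

    import numpy as np
    from pandas import DataFrame

    df = DataFrame({'a': [1., 2., 3.], 'b': [4., 5., 6.]})

    def spread(col, scale, offset=0.0):
        # col is each column as a Series when axis=0
        return (col.max() - col.min()) * scale + offset

    # Positional extras go through args, keyword extras through **kwds
    result = df.apply(spread, args=(2.0,), offset=1.0)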
42 changes: 42 additions & 0 deletions pandas/core/nanops.py
@@ -139,6 +139,48 @@ def nanprod(values, axis=None, skipna=True, copy=True):
result = values.prod(axis)
return _maybe_null_out(result, axis, mask)

def nanargmax(values, axis=None, skipna=True):
"""
Returns -1 in the NA case
"""
mask = -np.isfinite(values)
if not issubclass(values.dtype.type, np.integer):
values = values.copy()
np.putmask(values, mask, -np.inf)
result = values.argmax(axis)
result = _maybe_arg_null_out(result, axis, mask, skipna)
return result

def nanargmin(values, axis=None, skipna=True):
"""
Returns -1 in the NA case
"""
mask = -np.isfinite(values)
if not issubclass(values.dtype.type, np.integer):
values = values.copy()
np.putmask(values, mask, np.inf)
result = values.argmin(axis)
result = _maybe_arg_null_out(result, axis, mask, skipna)
return result

def _maybe_arg_null_out(result, axis, mask, skipna):
# helper function for nanargmin/nanargmax
if axis is None:
if skipna:
if mask.all():
result = -1
else:
if mask.any():
result = -1
else:
if skipna:
na_mask = mask.all(axis)
else:
na_mask = mask.any(axis)
if na_mask.any():
result[na_mask] = -1
return result

def _get_counts(mask, axis):
if axis is not None:
count = (mask.shape[axis] - mask.sum(axis)).astype(float)
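A quick illustration (not part of the diff) of the sentinel convention used by the new helpers: the position of the extremum comes back, with -1 standing in for the NA case.

    import numpy as np
    from pandas.core import nanops

    arr = np.array([np.nan, 3.0, 1.0])
    nanops.nanargmin(arr)                 # 2  (the NaN is skipped)
    nanops.nanargmin(arr, skipna=False)   # -1 (a NaN is present)
    nanops.nanargmax(np.array([np.nan, np.nan]))   # -1 (all values NA)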
18 changes: 8 additions & 10 deletions pandas/core/series.py
@@ -717,11 +717,10 @@ def idxmin(self, axis=None, out=None, skipna=True):
-------
idxmin : Index of minimum of values
"""
arr = self.values.copy()
if skipna:
if not issubclass(arr.dtype.type, np.integer):
np.putmask(arr, isnull(arr), np.inf)
return self.index[arr.argmin()]
i = nanops.nanargmin(self.values, skipna=skipna)
if i == -1:
return np.nan
return self.index[i]

def idxmax(self, axis=None, out=None, skipna=True):
"""
@@ -736,11 +735,10 @@ def idxmax(self, axis=None, out=None, skipna=True):
-------
idxmax : Index of maximum of values
"""
arr = self.values.copy()
if skipna:
if not issubclass(arr.dtype.type, np.integer):
np.putmask(arr, isnull(arr), -np.inf)
return self.index[arr.argmax()]
i = nanops.nanargmax(self.values, skipna=skipna)
if i == -1:
return np.nan
return self.index[i]

def _agg_by_level(self, name, level=0, skipna=True):
method = getattr(type(self), name)
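At the Series level the -1 sentinel becomes NaN, as the rewritten methods above show; a short illustration (mirroring the test changes further down):

    import numpy as np
    from pandas import Series

    s = Series([2.0, np.nan, 1.0], index=['a', 'b', 'c'])
    s.idxmin()               # 'c' -- the NaN is skipped by default
    s.idxmin(skipna=False)   # nan -- a NaN is present and skipna is off
    Series([np.nan, np.nan]).idxmax()   # nan -- all values are NA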
9 changes: 7 additions & 2 deletions pandas/src/parsing.pyx
@@ -126,6 +126,7 @@ def maybe_convert_objects(ndarray[object] objects):
bint seen_float = 0
bint seen_int = 0
bint seen_bool = 0
bint seen_object = 0
bint seen_null = 0
object val, onan
float64_t fval, fnan
@@ -164,14 +165,18 @@ def maybe_convert_objects(ndarray[object] objects):
seen_float = 1
except Exception:
pass
else:
seen_object = 1

if seen_null:
if seen_float or seen_int:
if (seen_float or seen_int) and not seen_object:
return floats
else:
return objects
else:
if seen_int:
if seen_object:
return objects
elif seen_int:
return ints
elif seen_float:
return floats
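The parsing.pyx change tracks whether any genuinely object-typed values (for example strings) were seen, so an object array mixing strings with NaN is no longer coerced to floats. A sketch mirroring the new test added in test_tseries.py below (the lib import path is assumed from the pandas layout of that era):

    import numpy as np
    import pandas._tseries as lib   # assumed module path for the Cython extension

    arr = np.array(['a', 'b', np.nan, np.nan, 'd'], dtype=object)
    result = lib.maybe_convert_objects(arr)
    # Strings were seen, so the array stays object dtype instead of the
    # NaNs dragging it to float64
    assert result.dtype == np.object_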
13 changes: 6 additions & 7 deletions pandas/src/reduce.pyx
@@ -59,7 +59,7 @@ cdef class Reducer:
chunk.data = arr.data
try:
for i in range(self.nresults):
res = self.f(self.dummy)
res = self.f(chunk)
if i == 0:
result = self._get_result_array(res)
it = <flatiter> PyArray_IterNew(result)
@@ -70,19 +70,18 @@
finally:
# so we don't free the wrong memory
chunk.data = dummy_buf

if result.dtype == np.object_:
result = maybe_convert_objects(result)

return result

def _get_result_array(self, object res):
try:
assert(not isinstance(res, np.ndarray))
if hasattr(res, 'dtype'):
result = np.empty(self.nresults, dtype=res.dtype)
else:
result = np.empty(self.nresults, dtype='O')
result = np.empty(self.nresults, dtype='O')
# if hasattr(res, 'dtype'):
# result = np.empty(self.nresults, dtype=res.dtype)
# else:
# result = np.empty(self.nresults, dtype='O')
result[0] = res
except Exception:
raise ValueError('function does not reduce')
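The Reducer now applies the function to the real chunk and always collects results into an object array, leaving the downcast to maybe_convert_objects afterwards rather than trusting the dtype of the first result. A rough pure-Python sketch of that collection pattern (not the actual Cython code):

    import numpy as np

    def reduce_columns(func, chunks):
        # Gather every result into an object array first...
        result = np.empty(len(chunks), dtype=object)
        for i, chunk in enumerate(chunks):
            result[i] = func(chunk)
        # ...then attempt a single downcast at the end, roughly what
        # maybe_convert_objects does for the Cython Reducer.
        try:
            return result.astype(float)
        except (TypeError, ValueError):
            return result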
56 changes: 9 additions & 47 deletions pandas/tests/test_frame.py
@@ -3628,66 +3628,28 @@ def test_dot(self):
assert_frame_equal(result, expected)

def test_idxmin(self):
def validate(f, s, axis, skipna):
def get_result(f, i, v, axis, skipna):
if axis == 0:
return (f[i][v], f[i].min(skipna=skipna))
else:
return (f[v][i], f.ix[i].min(skipna=skipna))
for i, v in s.iteritems():
(r1, r2) = get_result(f, i, v, axis, skipna)
if np.isnan(r1) or np.isinf(r1):
self.assert_(np.isnan(r2) or np.isinf(r2))
elif np.isnan(r2) or np.isinf(r2):
self.assert_(np.isnan(r1) or np.isinf(r1))
else:
self.assertEqual(r1, r2)

frame = self.frame
frame.ix[5:10] = np.nan
frame.ix[15:20, -2:] = np.nan
for skipna in [True, False]:
for axis in [0, 1]:
validate(frame,
frame.idxmin(axis=axis, skipna=skipna),
axis,
skipna)
validate(self.intframe,
self.intframe.idxmin(axis=axis, skipna=skipna),
axis,
skipna)
for df in [frame, self.intframe]:
result = df.idxmin(axis=axis, skipna=skipna)
expected = df.apply(Series.idxmin, axis=axis, skipna=skipna)
assert_series_equal(result, expected)

self.assertRaises(Exception, frame.idxmin, axis=2)
self.assertRaises(Exception, frame.idxmax, axis=2)

def test_idxmax(self):
def validate(f, s, axis, skipna):
def get_result(f, i, v, axis, skipna):
if axis == 0:
return (f[i][v], f[i].max(skipna=skipna))
else:
return (f[v][i], f.ix[i].max(skipna=skipna))
for i, v in s.iteritems():
(r1, r2) = get_result(f, i, v, axis, skipna)
if np.isnan(r1) or np.isinf(r1):
self.assert_(np.isnan(r2) or np.isinf(r2))
elif np.isnan(r2) or np.isinf(r2):
self.assert_(np.isnan(r1) or np.isinf(r1))
else:
self.assertEqual(r1, r2)

frame = self.frame
frame.ix[5:10] = np.nan
frame.ix[15:20, -2:] = np.nan
for skipna in [True, False]:
for axis in [0, 1]:
validate(frame,
frame.idxmax(axis=axis, skipna=skipna),
axis,
skipna)
validate(self.intframe,
self.intframe.idxmax(axis=axis, skipna=skipna),
axis,
skipna)
for df in [frame, self.intframe]:
result = df.idxmax(axis=axis, skipna=skipna)
expected = df.apply(Series.idxmax, axis=axis, skipna=skipna)
assert_series_equal(result, expected)

self.assertRaises(Exception, frame.idxmax, axis=2)

3 changes: 2 additions & 1 deletion pandas/tests/test_multilevel.py
@@ -294,7 +294,8 @@ def _check_counts(frame, axis=0):
for i in range(index.nlevels):
result = frame.count(axis=axis, level=i)
expected = frame.groupby(axis=axis, level=i).count(axis=axis)
assert_frame_equal(result, expected.reindex_like(result))
expected = expected.reindex_like(result).astype('i8')
assert_frame_equal(result, expected)

self.frame.ix[1, [1, 2]] = np.nan
self.frame.ix[7, [0, 1]] = np.nan
8 changes: 4 additions & 4 deletions pandas/tests/test_series.py
@@ -695,7 +695,7 @@ def test_idxmin(self):

# skipna or no
self.assertEqual(self.series[self.series.idxmin()], self.series.min())
self.assert_(isnull(self.series[self.series.idxmin(skipna=False)]))
self.assert_(isnull(self.series.idxmin(skipna=False)))

# no NaNs
nona = self.series.dropna()
@@ -705,7 +705,7 @@ def test_idxmin(self):

# all NaNs
allna = self.series * nan
self.assertEqual(allna.idxmin(), allna.index[0])
self.assert_(isnull(allna.idxmin()))

def test_idxmax(self):
# test idxmax
@@ -716,7 +716,7 @@ def test_idxmax(self):

# skipna or no
self.assertEqual(self.series[self.series.idxmax()], self.series.max())
self.assert_(isnull(self.series[self.series.idxmax(skipna=False)]))
self.assert_(isnull(self.series.idxmax(skipna=False)))

# no NaNs
nona = self.series.dropna()
@@ -726,7 +726,7 @@ def test_idxmax(self):

# all NaNs
allna = self.series * nan
self.assertEqual(allna.idxmax(), allna.index[0])
self.assert_(isnull(allna.idxmax()))

def test_operators_date(self):
result = self.objSeries + timedelta(1)
5 changes: 5 additions & 0 deletions pandas/tests/test_tseries.py
@@ -171,6 +171,11 @@ def test_duplicated_with_nas():
expected = trues + falses
assert(np.array_equal(result, expected))

def test_convert_objects():
arr = np.array(['a', 'b', np.nan, np.nan, 'd', 'e', 'f'], dtype='O')
result = lib.maybe_convert_objects(arr)
assert(result.dtype == np.object_)

class TestMoments(unittest.TestCase):
pass

