From c1307b68d5cc16848be832db67937fd9e639800e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 25 Nov 2011 15:11:54 -0500 Subject: [PATCH] BUG/ENH: idxmin/idxmax NA behavior should be same as other reductions, refactoring, bugfix in Cython object conversion function --- pandas/core/frame.py | 44 +++++++++++++++----------- pandas/core/nanops.py | 42 +++++++++++++++++++++++++ pandas/core/series.py | 18 +++++------ pandas/src/parsing.pyx | 9 ++++-- pandas/src/reduce.pyx | 13 ++++---- pandas/tests/test_frame.py | 56 ++++++--------------------------- pandas/tests/test_multilevel.py | 3 +- pandas/tests/test_series.py | 8 ++--- pandas/tests/test_tseries.py | 5 +++ 9 files changed, 108 insertions(+), 90 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5593bb86804c1..d819abc4683e3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2186,7 +2186,8 @@ def _shift_indexer(self, periods): #---------------------------------------------------------------------- # Function application - def apply(self, func, axis=0, broadcast=False, raw=False): + def apply(self, func, axis=0, broadcast=False, raw=False, + args=(), **kwds): """ Applies function along input axis of DataFrame. Objects passed to functions are Series objects having index either the DataFrame's index @@ -2207,6 +2208,10 @@ def apply(self, func, axis=0, broadcast=False, raw=False): passed function will receive ndarray objects instead. If you are just applying a NumPy reduction function this will achieve much better performance + args : tuple + Positional arguments to pass to function in addition to the + array/series + Additional keyword arguments will be passed as keywords to the function Examples -------- @@ -2226,14 +2231,19 @@ def apply(self, func, axis=0, broadcast=False, raw=False): if len(self.columns) == 0 and len(self.index) == 0: return self - if isinstance(func, np.ufunc): - results = func(self.values) + if kwds or args and not isinstance(func, np.ufunc): + f = lambda x: func(x, *args, **kwds) + else: + f = func + + if isinstance(f, np.ufunc): + results = f(self.values) return self._constructor(data=results, index=self.index, columns=self.columns, copy=False) else: if not broadcast: if not all(self.shape): - is_reduction = not isinstance(func(_EMPTY_SERIES), + is_reduction = not isinstance(f(_EMPTY_SERIES), np.ndarray) if is_reduction: return Series(np.nan, index=self._get_agg_axis(axis)) @@ -2241,11 +2251,11 @@ def apply(self, func, axis=0, broadcast=False, raw=False): return self.copy() if raw and not self._is_mixed_type: - return self._apply_raw(func, axis) + return self._apply_raw(f, axis) else: - return self._apply_standard(func, axis) + return self._apply_standard(f, axis) else: - return self._apply_broadcast(func, axis) + return self._apply_broadcast(f, axis) def _apply_raw(self, func, axis): try: @@ -2857,12 +2867,10 @@ def idxmin(self, axis=0, skipna=True): ------- idxmin : Series """ - values = self.values.copy() - if skipna and not issubclass(values.dtype.type, np.integer): - np.putmask(values, -np.isfinite(values), np.inf) - argmin_index = self._get_axis(axis) - return Series([argmin_index[i] for i in values.argmin(axis)], - index=self._get_agg_axis(axis)) + indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna) + index = self._get_axis(axis) + result = [index[i] if i >= 0 else np.nan for i in indices] + return Series(result, index=self._get_agg_axis(axis)) def idxmax(self, axis=0, skipna=True): """ @@ -2881,12 +2889,10 @@ def idxmax(self, axis=0, skipna=True): ------- idxmax : Series """ - values = self.values.copy() - if skipna and not issubclass(values.dtype.type, np.integer): - np.putmask(values, -np.isfinite(values), -np.inf) - argmax_index = self._get_axis(axis) - return Series([argmax_index[i] for i in values.argmax(axis)], - index=self._get_agg_axis(axis)) + indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna) + index = self._get_axis(axis) + result = [index[i] if i >= 0 else np.nan for i in indices] + return Series(result, index=self._get_agg_axis(axis)) def _agg_by_level(self, name, axis=0, level=0, skipna=True): method = getattr(type(self), name) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 46b710a4029af..4fadd5d1f338d 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -139,6 +139,48 @@ def nanprod(values, axis=None, skipna=True, copy=True): result = values.prod(axis) return _maybe_null_out(result, axis, mask) +def nanargmax(values, axis=None, skipna=True): + """ + Returns -1 in the NA case + """ + mask = -np.isfinite(values) + if not issubclass(values.dtype.type, np.integer): + values = values.copy() + np.putmask(values, mask, -np.inf) + result = values.argmax(axis) + result = _maybe_arg_null_out(result, axis, mask, skipna) + return result + +def nanargmin(values, axis=None, skipna=True): + """ + Returns -1 in the NA case + """ + mask = -np.isfinite(values) + if not issubclass(values.dtype.type, np.integer): + values = values.copy() + np.putmask(values, mask, np.inf) + result = values.argmin(axis) + result = _maybe_arg_null_out(result, axis, mask, skipna) + return result + +def _maybe_arg_null_out(result, axis, mask, skipna): + # helper function for nanargmin/nanargmax + if axis is None: + if skipna: + if mask.all(): + result = -1 + else: + if mask.any(): + result = -1 + else: + if skipna: + na_mask = mask.all(axis) + else: + na_mask = mask.any(axis) + if na_mask.any(): + result[na_mask] = -1 + return result + def _get_counts(mask, axis): if axis is not None: count = (mask.shape[axis] - mask.sum(axis)).astype(float) diff --git a/pandas/core/series.py b/pandas/core/series.py index 897c4b6f7e53f..68136f9b79e9d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -717,11 +717,10 @@ def idxmin(self, axis=None, out=None, skipna=True): ------- idxmin : Index of mimimum of values """ - arr = self.values.copy() - if skipna: - if not issubclass(arr.dtype.type, np.integer): - np.putmask(arr, isnull(arr), np.inf) - return self.index[arr.argmin()] + i = nanops.nanargmin(self.values, skipna=skipna) + if i == -1: + return np.nan + return self.index[i] def idxmax(self, axis=None, out=None, skipna=True): """ @@ -736,11 +735,10 @@ def idxmax(self, axis=None, out=None, skipna=True): ------- idxmax : Index of mimimum of values """ - arr = self.values.copy() - if skipna: - if not issubclass(arr.dtype.type, np.integer): - np.putmask(arr, isnull(arr), -np.inf) - return self.index[arr.argmax()] + i = nanops.nanargmax(self.values, skipna=skipna) + if i == -1: + return np.nan + return self.index[i] def _agg_by_level(self, name, level=0, skipna=True): method = getattr(type(self), name) diff --git a/pandas/src/parsing.pyx b/pandas/src/parsing.pyx index 874fa8d193ae4..76f62991afbe5 100644 --- a/pandas/src/parsing.pyx +++ b/pandas/src/parsing.pyx @@ -126,6 +126,7 @@ def maybe_convert_objects(ndarray[object] objects): bint seen_float = 0 bint seen_int = 0 bint seen_bool = 0 + bint seen_object = 0 bint seen_null = 0 object val, onan float64_t fval, fnan @@ -164,14 +165,18 @@ def maybe_convert_objects(ndarray[object] objects): seen_float = 1 except Exception: pass + else: + seen_object = 1 if seen_null: - if seen_float or seen_int: + if (seen_float or seen_int) and not seen_object: return floats else: return objects else: - if seen_int: + if seen_object: + return objects + elif seen_int: return ints elif seen_float: return floats diff --git a/pandas/src/reduce.pyx b/pandas/src/reduce.pyx index 9f54f1a8fee70..e325c727d3ad0 100644 --- a/pandas/src/reduce.pyx +++ b/pandas/src/reduce.pyx @@ -59,7 +59,7 @@ cdef class Reducer: chunk.data = arr.data try: for i in range(self.nresults): - res = self.f(self.dummy) + res = self.f(chunk) if i == 0: result = self._get_result_array(res) it = PyArray_IterNew(result) @@ -70,19 +70,18 @@ cdef class Reducer: finally: # so we don't free the wrong memory chunk.data = dummy_buf - if result.dtype == np.object_: result = maybe_convert_objects(result) - return result def _get_result_array(self, object res): try: assert(not isinstance(res, np.ndarray)) - if hasattr(res, 'dtype'): - result = np.empty(self.nresults, dtype=res.dtype) - else: - result = np.empty(self.nresults, dtype='O') + result = np.empty(self.nresults, dtype='O') + # if hasattr(res, 'dtype'): + # result = np.empty(self.nresults, dtype=res.dtype) + # else: + # result = np.empty(self.nresults, dtype='O') result[0] = res except Exception: raise ValueError('function does not reduce') diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index c8ad273a46cbb..d16a2daa3b471 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -3628,66 +3628,28 @@ def test_dot(self): assert_frame_equal(result, expected) def test_idxmin(self): - def validate(f, s, axis, skipna): - def get_result(f, i, v, axis, skipna): - if axis == 0: - return (f[i][v], f[i].min(skipna=skipna)) - else: - return (f[v][i], f.ix[i].min(skipna=skipna)) - for i, v in s.iteritems(): - (r1, r2) = get_result(f, i, v, axis, skipna) - if np.isnan(r1) or np.isinf(r1): - self.assert_(np.isnan(r2) or np.isinf(r2)) - elif np.isnan(r2) or np.isinf(r2): - self.assert_(np.isnan(r1) or np.isinf(r1)) - else: - self.assertEqual(r1, r2) - frame = self.frame frame.ix[5:10] = np.nan frame.ix[15:20, -2:] = np.nan for skipna in [True, False]: for axis in [0, 1]: - validate(frame, - frame.idxmin(axis=axis, skipna=skipna), - axis, - skipna) - validate(self.intframe, - self.intframe.idxmin(axis=axis, skipna=skipna), - axis, - skipna) + for df in [frame, self.intframe]: + result = df.idxmax(axis=axis, skipna=skipna) + expected = df.apply(Series.idxmax, axis=axis, skipna=skipna) + assert_series_equal(result, expected) - self.assertRaises(Exception, frame.idxmin, axis=2) + self.assertRaises(Exception, frame.idxmax, axis=2) def test_idxmax(self): - def validate(f, s, axis, skipna): - def get_result(f, i, v, axis, skipna): - if axis == 0: - return (f[i][v], f[i].max(skipna=skipna)) - else: - return (f[v][i], f.ix[i].max(skipna=skipna)) - for i, v in s.iteritems(): - (r1, r2) = get_result(f, i, v, axis, skipna) - if np.isnan(r1) or np.isinf(r1): - self.assert_(np.isnan(r2) or np.isinf(r2)) - elif np.isnan(r2) or np.isinf(r2): - self.assert_(np.isnan(r1) or np.isinf(r1)) - else: - self.assertEqual(r1, r2) - frame = self.frame frame.ix[5:10] = np.nan frame.ix[15:20, -2:] = np.nan for skipna in [True, False]: for axis in [0, 1]: - validate(frame, - frame.idxmax(axis=axis, skipna=skipna), - axis, - skipna) - validate(self.intframe, - self.intframe.idxmax(axis=axis, skipna=skipna), - axis, - skipna) + for df in [frame, self.intframe]: + result = df.idxmax(axis=axis, skipna=skipna) + expected = df.apply(Series.idxmax, axis=axis, skipna=skipna) + assert_series_equal(result, expected) self.assertRaises(Exception, frame.idxmax, axis=2) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 6d94de237fcda..969e2775e8087 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -294,7 +294,8 @@ def _check_counts(frame, axis=0): for i in range(index.nlevels): result = frame.count(axis=axis, level=i) expected = frame.groupby(axis=axis, level=i).count(axis=axis) - assert_frame_equal(result, expected.reindex_like(result)) + expected = expected.reindex_like(result).astype('i8') + assert_frame_equal(result, expected) self.frame.ix[1, [1, 2]] = np.nan self.frame.ix[7, [0, 1]] = np.nan diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 90eb84d098891..cae6f5cfea542 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -695,7 +695,7 @@ def test_idxmin(self): # skipna or no self.assertEqual(self.series[self.series.idxmin()], self.series.min()) - self.assert_(isnull(self.series[self.series.idxmin(skipna=False)])) + self.assert_(isnull(self.series.idxmin(skipna=False))) # no NaNs nona = self.series.dropna() @@ -705,7 +705,7 @@ def test_idxmin(self): # all NaNs allna = self.series * nan - self.assertEqual(allna.idxmin(), allna.index[0]) + self.assert_(isnull(allna.idxmin())) def test_idxmax(self): # test idxmax @@ -716,7 +716,7 @@ def test_idxmax(self): # skipna or no self.assertEqual(self.series[self.series.idxmax()], self.series.max()) - self.assert_(isnull(self.series[self.series.idxmax(skipna=False)])) + self.assert_(isnull(self.series.idxmax(skipna=False))) # no NaNs nona = self.series.dropna() @@ -726,7 +726,7 @@ def test_idxmax(self): # all NaNs allna = self.series * nan - self.assertEqual(allna.idxmax(), allna.index[0]) + self.assert_(isnull(allna.idxmax())) def test_operators_date(self): result = self.objSeries + timedelta(1) diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py index 17678b59769e4..89342c8a75192 100644 --- a/pandas/tests/test_tseries.py +++ b/pandas/tests/test_tseries.py @@ -171,6 +171,11 @@ def test_duplicated_with_nas(): expected = trues + falses assert(np.array_equal(result, expected)) +def test_convert_objects(): + arr = np.array(['a', 'b', np.nan, np.nan, 'd', 'e', 'f'], dtype='O') + result = lib.maybe_convert_objects(arr) + assert(result.dtype == np.object_) + class TestMoments(unittest.TestCase): pass