From d409035e133c663687fe08584b510011f0800d84 Mon Sep 17 00:00:00 2001 From: Daniel Grady Date: Mon, 22 May 2017 16:00:17 -0700 Subject: [PATCH] BUG: Fix behavior of argmax and argmin with inf (#16449) Closes #13595 The implementations of `nanargmin` and `nanargmax` in `nanops` were forcing the `_get_values` utility function to always mask out infinite values. For example, in `nanargmax`, >>> nanops._get_values(np.array([1, np.nan, np.inf]), True, isfinite=True, fill_value_typ='-inf') (array([ 1., -inf, -inf]), array([False, True, True], dtype=bool), dtype('float64'), numpy.float64) The first element of the result tuple (the masked version of the values array) is used for actually finding the max or min argument. As a result, infinite values could never be correctly recognized as the maximum or minimum values in an array. This also affects the behavior of `DataFrame.groupby.idxmax`: since `Series.idxmax` previously raised a `ValueError` with string data, the group by would silently drop columns that contained strings. --- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/core/nanops.py | 6 +-- pandas/tests/groupby/test_groupby.py | 3 +- pandas/tests/series/test_operators.py | 68 ++++++++++++++++++++++++++- 4 files changed, 72 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 9d330cf3fdf2d..08f58086aa8a5 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -127,7 +127,7 @@ Sparse Reshaping ^^^^^^^^^ - +- `argmin`, `argmax`, `idxmin`, and `idxmax` on Series, DataFrame, and GroupBy objects work correctly with floating point data that contains infinite values (:issue:`13595`). These functions now also work with string data, as long as there are no missing values. Numeric ^^^^^^^ diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 1d64f87b15761..9630dba7917cf 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -474,8 +474,7 @@ def nanargmax(values, axis=None, skipna=True): """ Returns -1 in the NA case """ - values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='-inf', - isfinite=True) + values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='-inf') result = values.argmax(axis) result = _maybe_arg_null_out(result, axis, mask, skipna) return result @@ -485,8 +484,7 @@ def nanargmin(values, axis=None, skipna=True): """ Returns -1 in the NA case """ - values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='+inf', - isfinite=True) + values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='+inf') result = values.argmin(axis) result = _maybe_arg_null_out(result, axis, mask, skipna) return result diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 19124a33bdbcb..1dfb07f2d729f 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2339,7 +2339,8 @@ def test_non_cython_api(self): assert_frame_equal(result, expected) # idxmax - expected = DataFrame([[0], [nan]], columns=['B'], index=[1, 3]) + expected = DataFrame([[0.0, 0.0], [nan, 2.0]], columns=['B', 'C'], + index=[1, 3]) expected.index.name = 'A' result = g.idxmax() assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 2e400812e0331..dec8ca1dc3f4b 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -20,7 +20,7 @@ from pandas.core.indexes.timedeltas import Timedelta import pandas.core.nanops as nanops -from pandas.compat import range, zip +from pandas.compat import range, zip, PY3 from pandas import compat from pandas.util.testing import (assert_series_equal, assert_almost_equal, assert_frame_equal, assert_index_equal) @@ -1857,3 +1857,69 @@ def test_op_duplicate_index(self): result = s1 + s2 expected = pd.Series([11, 12, np.nan], index=[1, 1, 2]) assert_series_equal(result, expected) + + def test_argminmax(self): + # Series.argmin, Series.argmax are aliased to Series.idxmin, + # Series.idxmax + + # Expected behavior for empty Series + s = pd.Series([]) + + with pytest.raises(ValueError): + s.argmin() + with pytest.raises(ValueError): + s.argmin(skipna=False) + with pytest.raises(ValueError): + s.argmax() + with pytest.raises(ValueError): + s.argmax(skipna=False) + + # For numeric data with NA and Inf (GH #13595) + s = pd.Series([0, -np.inf, np.inf, np.nan]) + + assert s.argmin() == 1 + assert np.isnan(s.argmin(skipna=False)) + + assert s.argmax() == 2 + assert np.isnan(s.argmax(skipna=False)) + + # Using old-style behavior that treats floating point nan, -inf, and + # +inf as missing + s = pd.Series([0, -np.inf, np.inf, np.nan]) + + with pd.option_context('mode.use_inf_as_null', True): + assert s.argmin() == 0 + assert np.isnan(s.argmin(skipna=False)) + assert s.argmax() == 0 + np.isnan(s.argmax(skipna=False)) + + # For non-NA strings + s = pd.Series(['foo', 'foo', 'bar', 'bar', 'baz']) + + assert s.argmin() == 2 + assert s.argmin(skipna=False) == 2 + + assert s.argmax() == 0 + assert s.argmax(skipna=False) == 0 + + # For mixed string and NA + # This works differently under Python 2 and 3: under Python 2, + # comparing strings and None, for example, is valid, and we can + # compute an argmax. Under Python 3, such comparisons are not valid + # and raise a TypeError. + s = pd.Series(['foo', 'foo', 'bar', 'bar', None, np.nan, 'baz']) + + if PY3: + with pytest.raises(TypeError): + s.argmin() + with pytest.raises(TypeError): + s.argmin(skipna=False) + with pytest.raises(TypeError): + s.argmax() + with pytest.raises(TypeError): + s.argmax(skipna=False) + else: + assert s.argmin() == 4 + assert np.isnan(s.argmin(skipna=False)) + assert s.argmax() == 0 + assert np.isnan(s.argmax(skipna=False))