Skip to content

Commit

Permalink
BUG: Fix behavior of argmax and argmin with inf (#16449)
Browse files Browse the repository at this point in the history
Closes #13595

The implementations of `nanargmin` and `nanargmax` in `nanops` were
forcing the `_get_values` utility function to always mask out infinite
values. For example, in `nanargmax`,

    >>> nanops._get_values(np.array([1, np.nan, np.inf]), True,
    isfinite=True, fill_value_typ='-inf')

    (array([  1., -inf, -inf]),
     array([False,  True,  True], dtype=bool),
     dtype('float64'),
     numpy.float64)

The first element of the result tuple (the masked version of the
values array) is used for actually finding the max or min argument. As
a result, infinite values could never be correctly recognized as the
maximum or minimum values in an array.

This also affects the behavior of `DataFrame.groupby.idxmax`: since
`Series.idxmax` previously raised a `ValueError` with string data, the
group by would silently drop columns that contained strings.
  • Loading branch information
DGrady authored and Daniel Grady committed Jun 22, 2017
1 parent 18f7b1c commit d409035
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 7 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ Sparse
Reshaping
^^^^^^^^^


- `argmin`, `argmax`, `idxmin`, and `idxmax` on Series, DataFrame, and GroupBy objects work correctly with floating point data that contains infinite values (:issue:`13595`). These functions now also work with string data, as long as there are no missing values.

Numeric
^^^^^^^
Expand Down
6 changes: 2 additions & 4 deletions pandas/core/nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,8 +474,7 @@ def nanargmax(values, axis=None, skipna=True):
"""
Returns -1 in the NA case
"""
values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='-inf',
isfinite=True)
values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='-inf')
result = values.argmax(axis)
result = _maybe_arg_null_out(result, axis, mask, skipna)
return result
Expand All @@ -485,8 +484,7 @@ def nanargmin(values, axis=None, skipna=True):
"""
Returns -1 in the NA case
"""
values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='+inf',
isfinite=True)
values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='+inf')
result = values.argmin(axis)
result = _maybe_arg_null_out(result, axis, mask, skipna)
return result
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2339,7 +2339,8 @@ def test_non_cython_api(self):
assert_frame_equal(result, expected)

# idxmax
expected = DataFrame([[0], [nan]], columns=['B'], index=[1, 3])
expected = DataFrame([[0.0, 0.0], [nan, 2.0]], columns=['B', 'C'],
index=[1, 3])
expected.index.name = 'A'
result = g.idxmax()
assert_frame_equal(result, expected)
Expand Down
68 changes: 67 additions & 1 deletion pandas/tests/series/test_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from pandas.core.indexes.timedeltas import Timedelta
import pandas.core.nanops as nanops

from pandas.compat import range, zip
from pandas.compat import range, zip, PY3
from pandas import compat
from pandas.util.testing import (assert_series_equal, assert_almost_equal,
assert_frame_equal, assert_index_equal)
Expand Down Expand Up @@ -1857,3 +1857,69 @@ def test_op_duplicate_index(self):
result = s1 + s2
expected = pd.Series([11, 12, np.nan], index=[1, 1, 2])
assert_series_equal(result, expected)

def test_argminmax(self):
# Series.argmin, Series.argmax are aliased to Series.idxmin,
# Series.idxmax

# Expected behavior for empty Series
s = pd.Series([])

with pytest.raises(ValueError):
s.argmin()
with pytest.raises(ValueError):
s.argmin(skipna=False)
with pytest.raises(ValueError):
s.argmax()
with pytest.raises(ValueError):
s.argmax(skipna=False)

# For numeric data with NA and Inf (GH #13595)
s = pd.Series([0, -np.inf, np.inf, np.nan])

assert s.argmin() == 1
assert np.isnan(s.argmin(skipna=False))

assert s.argmax() == 2
assert np.isnan(s.argmax(skipna=False))

# Using old-style behavior that treats floating point nan, -inf, and
# +inf as missing
s = pd.Series([0, -np.inf, np.inf, np.nan])

with pd.option_context('mode.use_inf_as_null', True):
assert s.argmin() == 0
assert np.isnan(s.argmin(skipna=False))
assert s.argmax() == 0
np.isnan(s.argmax(skipna=False))

# For non-NA strings
s = pd.Series(['foo', 'foo', 'bar', 'bar', 'baz'])

assert s.argmin() == 2
assert s.argmin(skipna=False) == 2

assert s.argmax() == 0
assert s.argmax(skipna=False) == 0

# For mixed string and NA
# This works differently under Python 2 and 3: under Python 2,
# comparing strings and None, for example, is valid, and we can
# compute an argmax. Under Python 3, such comparisons are not valid
# and raise a TypeError.
s = pd.Series(['foo', 'foo', 'bar', 'bar', None, np.nan, 'baz'])

if PY3:
with pytest.raises(TypeError):
s.argmin()
with pytest.raises(TypeError):
s.argmin(skipna=False)
with pytest.raises(TypeError):
s.argmax()
with pytest.raises(TypeError):
s.argmax(skipna=False)
else:
assert s.argmin() == 4
assert np.isnan(s.argmin(skipna=False))
assert s.argmax() == 0
assert np.isnan(s.argmax(skipna=False))

0 comments on commit d409035

Please sign in to comment.