Skip to content

Commit

Permalink
BUG: Fix Series.nlargest for integer boundary values (#21432)
Browse files Browse the repository at this point in the history
(cherry picked from commit ec5956e)
  • Loading branch information
jschendel authored and jorisvandenbossche committed Jun 29, 2018
1 parent 787ef30 commit 28b831f
Show file tree
Hide file tree
Showing 5 changed files with 321 additions and 43 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,5 @@ Bug Fixes

**Other**

- Bug in :meth:`Series.nlargest` for signed and unsigned integer dtypes when the minimum value is present (:issue:`21426`)
-
71 changes: 71 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,14 @@ def join_type(request):
return request.param


@pytest.fixture(params=['nlargest', 'nsmallest'])
def nselect_method(request):
"""
Fixture for trying all nselect methods
"""
return request.param


@pytest.fixture(params=[None, np.nan, pd.NaT, float('nan'), np.float('NaN')])
def nulls_fixture(request):
"""
Expand Down Expand Up @@ -170,3 +178,66 @@ def string_dtype(request):
* 'U'
"""
return request.param


@pytest.fixture(params=["float32", "float64"])
def float_dtype(request):
"""
Parameterized fixture for float dtypes.
* float32
* float64
"""

return request.param


UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"]
SIGNED_INT_DTYPES = ["int8", "int16", "int32", "int64"]
ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES


@pytest.fixture(params=SIGNED_INT_DTYPES)
def sint_dtype(request):
"""
Parameterized fixture for signed integer dtypes.
* int8
* int16
* int32
* int64
"""

return request.param


@pytest.fixture(params=UNSIGNED_INT_DTYPES)
def uint_dtype(request):
"""
Parameterized fixture for unsigned integer dtypes.
* uint8
* uint16
* uint32
* uint64
"""

return request.param


@pytest.fixture(params=ALL_INT_DTYPES)
def any_int_dtype(request):
"""
Parameterized fixture for any integer dtypes.
* int8
* uint8
* int16
* uint16
* int32
* uint32
* int64
* uint64
"""

return request.param
5 changes: 4 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1131,9 +1131,12 @@ def compute(self, method):
return dropped[slc].sort_values(ascending=ascending).head(n)

# fast method
arr, _, _ = _ensure_data(dropped.values)
arr, pandas_dtype, _ = _ensure_data(dropped.values)
if method == 'nlargest':
arr = -arr
if is_integer_dtype(pandas_dtype):
# GH 21426: ensure reverse ordering at boundaries
arr -= 1

if self.keep == 'last':
arr = arr[::-1]
Expand Down
78 changes: 36 additions & 42 deletions pandas/tests/frame/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from numpy.random import randn
import numpy as np

from pandas.compat import lrange, product, PY35
from pandas.compat import lrange, PY35
from pandas import (compat, isna, notna, DataFrame, Series,
MultiIndex, date_range, Timestamp, Categorical,
_np_version_under1p12, _np_version_under1p15)
Expand Down Expand Up @@ -2240,54 +2240,49 @@ class TestNLargestNSmallest(object):

# ----------------------------------------------------------------------
# Top / bottom
@pytest.mark.parametrize(
'method, n, order',
product(['nsmallest', 'nlargest'], range(1, 11),
[['a'],
['c'],
['a', 'b'],
['a', 'c'],
['b', 'a'],
['b', 'c'],
['a', 'b', 'c'],
['c', 'a', 'b'],
['c', 'b', 'a'],
['b', 'c', 'a'],
['b', 'a', 'c'],
# dups!
['b', 'c', 'c'],
]))
def test_n(self, df_strings, method, n, order):
@pytest.mark.parametrize('order', [
['a'],
['c'],
['a', 'b'],
['a', 'c'],
['b', 'a'],
['b', 'c'],
['a', 'b', 'c'],
['c', 'a', 'b'],
['c', 'b', 'a'],
['b', 'c', 'a'],
['b', 'a', 'c'],
# dups!
['b', 'c', 'c']])
@pytest.mark.parametrize('n', range(1, 11))
def test_n(self, df_strings, nselect_method, n, order):
# GH10393
df = df_strings
if 'b' in order:

error_msg = self.dtype_error_msg_template.format(
column='b', method=method, dtype='object')
column='b', method=nselect_method, dtype='object')
with tm.assert_raises_regex(TypeError, error_msg):
getattr(df, method)(n, order)
getattr(df, nselect_method)(n, order)
else:
ascending = method == 'nsmallest'
result = getattr(df, method)(n, order)
ascending = nselect_method == 'nsmallest'
result = getattr(df, nselect_method)(n, order)
expected = df.sort_values(order, ascending=ascending).head(n)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
'method, columns',
product(['nsmallest', 'nlargest'],
product(['group'], ['category_string', 'string'])
))
def test_n_error(self, df_main_dtypes, method, columns):
@pytest.mark.parametrize('columns', [
('group', 'category_string'), ('group', 'string')])
def test_n_error(self, df_main_dtypes, nselect_method, columns):
df = df_main_dtypes
col = columns[1]
error_msg = self.dtype_error_msg_template.format(
column=columns[1], method=method, dtype=df[columns[1]].dtype)
column=col, method=nselect_method, dtype=df[col].dtype)
# escape some characters that may be in the repr
error_msg = (error_msg.replace('(', '\\(').replace(")", "\\)")
.replace("[", "\\[").replace("]", "\\]"))
with tm.assert_raises_regex(TypeError, error_msg):
getattr(df, method)(2, columns)
getattr(df, nselect_method)(2, columns)

def test_n_all_dtypes(self, df_main_dtypes):
df = df_main_dtypes
Expand All @@ -2308,15 +2303,14 @@ def test_n_identical_values(self):
expected = pd.DataFrame({'a': [1] * 3, 'b': [1, 2, 3]})
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
'n, order',
product([1, 2, 3, 4, 5],
[['a', 'b', 'c'],
['c', 'b', 'a'],
['a'],
['b'],
['a', 'b'],
['c', 'b']]))
@pytest.mark.parametrize('order', [
['a', 'b', 'c'],
['c', 'b', 'a'],
['a'],
['b'],
['a', 'b'],
['c', 'b']])
@pytest.mark.parametrize('n', range(1, 6))
def test_n_duplicate_index(self, df_duplicates, n, order):
# GH 13412

Expand Down
Loading

0 comments on commit 28b831f

Please sign in to comment.