Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Fix Series.nlargest for integer boundary values #21432

Merged
merged 4 commits into from
Jun 15, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -82,4 +82,5 @@ Bug Fixes

**Other**

- Bug in :meth:`Series.nlargest` for signed and unsigned integer dtypes when the minimum value is present (:issue:`21426`)
-
71 changes: 71 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,14 @@ def join_type(request):
return request.param


@pytest.fixture(params=['nlargest', 'nsmallest'])
def nselect_method(request):
"""
Fixture for trying all nselect methods
"""
return request.param


@pytest.fixture(params=[None, np.nan, pd.NaT, float('nan'), np.float('NaN')])
def nulls_fixture(request):
"""
Expand Down Expand Up @@ -170,3 +178,66 @@ def string_dtype(request):
* 'U'
"""
return request.param


@pytest.fixture(params=["float32", "float64"])
def float_dtype(request):
"""
Parameterized fixture for float dtypes.

* float32
* float64
"""

return request.param


UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"]
SIGNED_INT_DTYPES = ["int8", "int16", "int32", "int64"]
ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES


@pytest.fixture(params=SIGNED_INT_DTYPES)
def sint_dtype(request):
"""
Parameterized fixture for signed integer dtypes.

* int8
* int16
* int32
* int64
"""

return request.param


@pytest.fixture(params=UNSIGNED_INT_DTYPES)
def uint_dtype(request):
"""
Parameterized fixture for unsigned integer dtypes.

* uint8
* uint16
* uint32
* uint64
"""

return request.param


@pytest.fixture(params=ALL_INT_DTYPES)
def any_int_dtype(request):
"""
Parameterized fixture for any integer dtypes.

* int8
* uint8
* int16
* uint16
* int32
* uint32
* int64
* uint64
"""

return request.param
5 changes: 4 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1133,9 +1133,12 @@ def compute(self, method):
return dropped[slc].sort_values(ascending=ascending).head(n)

# fast method
arr, _, _ = _ensure_data(dropped.values)
arr, pandas_dtype, _ = _ensure_data(dropped.values)
if method == 'nlargest':
arr = -arr
if is_integer_dtype(pandas_dtype):
# GH 21426: ensure reverse ordering at boundaries
arr -= 1

if self.keep == 'last':
arr = arr[::-1]
Expand Down
78 changes: 36 additions & 42 deletions pandas/tests/frame/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from numpy.random import randn
import numpy as np

from pandas.compat import lrange, product, PY35
from pandas.compat import lrange, PY35
from pandas import (compat, isna, notna, DataFrame, Series,
MultiIndex, date_range, Timestamp, Categorical,
_np_version_under1p12, _np_version_under1p15,
Expand Down Expand Up @@ -2260,54 +2260,49 @@ class TestNLargestNSmallest(object):

# ----------------------------------------------------------------------
# Top / bottom
@pytest.mark.parametrize(
'method, n, order',
product(['nsmallest', 'nlargest'], range(1, 11),
[['a'],
['c'],
['a', 'b'],
['a', 'c'],
['b', 'a'],
['b', 'c'],
['a', 'b', 'c'],
['c', 'a', 'b'],
['c', 'b', 'a'],
['b', 'c', 'a'],
['b', 'a', 'c'],

# dups!
['b', 'c', 'c'],

]))
def test_n(self, df_strings, method, n, order):
@pytest.mark.parametrize('order', [
['a'],
['c'],
['a', 'b'],
['a', 'c'],
['b', 'a'],
['b', 'c'],
['a', 'b', 'c'],
['c', 'a', 'b'],
['c', 'b', 'a'],
['b', 'c', 'a'],
['b', 'a', 'c'],

# dups!
['b', 'c', 'c']])
@pytest.mark.parametrize('n', range(1, 11))
def test_n(self, df_strings, nselect_method, n, order):
# GH10393
df = df_strings
if 'b' in order:

error_msg = self.dtype_error_msg_template.format(
column='b', method=method, dtype='object')
column='b', method=nselect_method, dtype='object')
with tm.assert_raises_regex(TypeError, error_msg):
getattr(df, method)(n, order)
getattr(df, nselect_method)(n, order)
else:
ascending = method == 'nsmallest'
result = getattr(df, method)(n, order)
ascending = nselect_method == 'nsmallest'
result = getattr(df, nselect_method)(n, order)
expected = df.sort_values(order, ascending=ascending).head(n)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
'method, columns',
product(['nsmallest', 'nlargest'],
product(['group'], ['category_string', 'string'])
))
def test_n_error(self, df_main_dtypes, method, columns):
@pytest.mark.parametrize('columns', [
('group', 'category_string'), ('group', 'string')])
def test_n_error(self, df_main_dtypes, nselect_method, columns):
df = df_main_dtypes
col = columns[1]
error_msg = self.dtype_error_msg_template.format(
column=columns[1], method=method, dtype=df[columns[1]].dtype)
column=col, method=nselect_method, dtype=df[col].dtype)
# escape some characters that may be in the repr
error_msg = (error_msg.replace('(', '\\(').replace(")", "\\)")
.replace("[", "\\[").replace("]", "\\]"))
with tm.assert_raises_regex(TypeError, error_msg):
getattr(df, method)(2, columns)
getattr(df, nselect_method)(2, columns)

def test_n_all_dtypes(self, df_main_dtypes):
df = df_main_dtypes
Expand All @@ -2328,15 +2323,14 @@ def test_n_identical_values(self):
expected = pd.DataFrame({'a': [1] * 3, 'b': [1, 2, 3]})
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
'n, order',
product([1, 2, 3, 4, 5],
[['a', 'b', 'c'],
['c', 'b', 'a'],
['a'],
['b'],
['a', 'b'],
['c', 'b']]))
@pytest.mark.parametrize('order', [
['a', 'b', 'c'],
['c', 'b', 'a'],
['a'],
['b'],
['a', 'b'],
['c', 'b']])
@pytest.mark.parametrize('n', range(1, 6))
def test_n_duplicate_index(self, df_duplicates, n, order):
# GH 13412

Expand Down
35 changes: 35 additions & 0 deletions pandas/tests/series/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1944,6 +1944,15 @@ def test_mode_sortwarning(self):
tm.assert_series_equal(result, expected)


def assert_check_nselect_boundary(vals, dtype, method):
# helper function for 'test_boundary_{dtype}' tests
s = Series(vals, dtype=dtype)
result = getattr(s, method)(3)
expected_idxr = [0, 1, 2] if method == 'nsmallest' else [3, 2, 1]
expected = s.loc[expected_idxr]
tm.assert_series_equal(result, expected)


class TestNLargestNSmallest(object):

@pytest.mark.parametrize(
Expand Down Expand Up @@ -2028,6 +2037,32 @@ def test_n(self, n):
expected = s.sort_values().head(n)
assert_series_equal(result, expected)

def test_boundary_integer(self, nselect_method, any_int_dtype):
# GH 21426
dtype_info = np.iinfo(any_int_dtype)
min_val, max_val = dtype_info.min, dtype_info.max
vals = [min_val, min_val + 1, max_val - 1, max_val]
assert_check_nselect_boundary(vals, any_int_dtype, nselect_method)

def test_boundary_float(self, nselect_method, float_dtype):
# GH 21426
dtype_info = np.finfo(float_dtype)
min_val, max_val = dtype_info.min, dtype_info.max
min_2nd, max_2nd = np.nextafter(
[min_val, max_val], 0, dtype=float_dtype)
vals = [min_val, min_2nd, max_2nd, max_val]
assert_check_nselect_boundary(vals, float_dtype, nselect_method)

@pytest.mark.parametrize('dtype', ['datetime64[ns]', 'timedelta64[ns]'])
def test_boundary_datetimelike(self, nselect_method, dtype):
# GH 21426
# use int64 bounds and +1 to min_val since true minimum is NaT
# (include min_val/NaT at end to maintain same expected_idxr)
dtype_info = np.iinfo('int64')
min_val, max_val = dtype_info.min, dtype_info.max
vals = [min_val + 1, min_val + 2, max_val - 1, max_val, min_val]
assert_check_nselect_boundary(vals, dtype, nselect_method)


class TestCategoricalSeriesAnalytics(object):

Expand Down