From b247e840ae4eb4ae19e9a7298e950ee75384378f Mon Sep 17 00:00:00 2001 From: jschendel Date: Mon, 11 Jun 2018 17:28:30 -0600 Subject: [PATCH 1/4] BUG: Fix Series.nlargest for integer boundary values --- pandas/core/algorithms.py | 5 ++++- pandas/tests/series/test_analytics.py | 29 +++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b33c10da7813e..9e34b8eb55ccb 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1133,9 +1133,12 @@ def compute(self, method): return dropped[slc].sort_values(ascending=ascending).head(n) # fast method - arr, _, _ = _ensure_data(dropped.values) + arr, pandas_dtype, _ = _ensure_data(dropped.values) if method == 'nlargest': arr = -arr + if is_integer_dtype(pandas_dtype): + # GH 21426: ensure reverse ordering at boundaries + arr -= 1 if self.keep == 'last': arr = arr[::-1] diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index aba472f2ce8f9..1d855271fd56b 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -13,6 +13,8 @@ from pandas import (Series, Categorical, DataFrame, isna, notna, bdate_range, date_range, _np_version_under1p10, CategoricalIndex) +from pandas.core.dtypes.common import ( + is_float_dtype, is_integer_dtype, is_datetimelike) from pandas.core.index import MultiIndex from pandas.core.indexes.datetimes import Timestamp from pandas.core.indexes.timedeltas import Timedelta @@ -2028,6 +2030,33 @@ def test_n(self, n): expected = s.sort_values().head(n) assert_series_equal(result, expected) + @pytest.mark.parametrize('dtype', [ + 'int8', 'int16', 'int32', 'int64', + 'uint8', 'uint16', 'uint32', 'uint64', + 'float16', 'float32', 'float64', + 'datetime64[ns]', 'timedelta64[ns]']) + @pytest.mark.parametrize('method', ['nsmallest', 'nlargest']) + def test_boundary(self, method, dtype): + # GH 21426 + if is_float_dtype(dtype): + min_val, max_val = np.finfo(dtype).min, np.finfo(dtype).max + min_2nd, max_2nd = np.nextafter([min_val, max_val], 0, dtype=dtype) + vals = [min_val, min_2nd, max_2nd, max_val] + elif is_integer_dtype(dtype): + min_val, max_val = np.iinfo(dtype).min, np.iinfo(dtype).max + vals = [min_val, min_val + 1, max_val - 1, max_val] + elif is_datetimelike(dtype): + # use int64 bounds and +1 to min_val since true minimum is NaT + # (include min_val/NaT at end to maintain same expected_idxr) + min_val, max_val = np.iinfo('int64').min, np.iinfo('int64').max + vals = [min_val + 1, min_val + 2, max_val - 1, max_val, min_val] + + s = Series(vals, dtype=dtype) + result = getattr(s, method)(3) + expected_idxr = [0, 1, 2] if method == 'nsmallest' else [3, 2, 1] + expected = s.loc[expected_idxr] + tm.assert_series_equal(result, expected) + class TestCategoricalSeriesAnalytics(object): From b336dfbd05282a072abf613b20b5a1d49ec9e7f6 Mon Sep 17 00:00:00 2001 From: jschendel Date: Mon, 11 Jun 2018 18:32:43 -0600 Subject: [PATCH 2/4] add whatsnew --- doc/source/whatsnew/v0.23.2.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 79a4c3da2ffa4..b8d865195cddd 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -82,4 +82,5 @@ Bug Fixes **Other** +- Bug in :meth:`Series.nlargest` for signed and unsigned integer dtypes when the minimum value is present (:issue:`21426`) - From b5e803cf34801d62ab1d37923107fef73ed31b55 Mon Sep 17 00:00:00 2001 From: jschendel Date: Mon, 11 Jun 2018 18:49:36 -0600 Subject: [PATCH 3/4] split tests --- pandas/tests/series/test_analytics.py | 55 ++++++++++++++++----------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 1d855271fd56b..980805b340bf7 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -13,8 +13,6 @@ from pandas import (Series, Categorical, DataFrame, isna, notna, bdate_range, date_range, _np_version_under1p10, CategoricalIndex) -from pandas.core.dtypes.common import ( - is_float_dtype, is_integer_dtype, is_datetimelike) from pandas.core.index import MultiIndex from pandas.core.indexes.datetimes import Timestamp from pandas.core.indexes.timedeltas import Timedelta @@ -1948,6 +1946,10 @@ def test_mode_sortwarning(self): class TestNLargestNSmallest(object): + @pytest.fixture(params=['nlargest', 'nsmallest']) + def method(self, request): + return request.param + @pytest.mark.parametrize( "r", [Series([3., 2, 1, 2, '5'], dtype='object'), Series([3., 2, 1, 2, 5], dtype='object'), @@ -2030,33 +2032,40 @@ def test_n(self, n): expected = s.sort_values().head(n) assert_series_equal(result, expected) - @pytest.mark.parametrize('dtype', [ - 'int8', 'int16', 'int32', 'int64', - 'uint8', 'uint16', 'uint32', 'uint64', - 'float16', 'float32', 'float64', - 'datetime64[ns]', 'timedelta64[ns]']) - @pytest.mark.parametrize('method', ['nsmallest', 'nlargest']) - def test_boundary(self, method, dtype): - # GH 21426 - if is_float_dtype(dtype): - min_val, max_val = np.finfo(dtype).min, np.finfo(dtype).max - min_2nd, max_2nd = np.nextafter([min_val, max_val], 0, dtype=dtype) - vals = [min_val, min_2nd, max_2nd, max_val] - elif is_integer_dtype(dtype): - min_val, max_val = np.iinfo(dtype).min, np.iinfo(dtype).max - vals = [min_val, min_val + 1, max_val - 1, max_val] - elif is_datetimelike(dtype): - # use int64 bounds and +1 to min_val since true minimum is NaT - # (include min_val/NaT at end to maintain same expected_idxr) - min_val, max_val = np.iinfo('int64').min, np.iinfo('int64').max - vals = [min_val + 1, min_val + 2, max_val - 1, max_val, min_val] - + def _check_nselect_boundary(self, vals, dtype, method): + # helper function for 'test_boundary_dtype' tests s = Series(vals, dtype=dtype) result = getattr(s, method)(3) expected_idxr = [0, 1, 2] if method == 'nsmallest' else [3, 2, 1] expected = s.loc[expected_idxr] tm.assert_series_equal(result, expected) + @pytest.mark.parametrize('dtype', [ + 'int8', 'int16', 'int32', 'int64', + 'uint8', 'uint16', 'uint32', 'uint64']) + def test_boundary_integer(self, method, dtype): + # GH 21426 + min_val, max_val = np.iinfo(dtype).min, np.iinfo(dtype).max + vals = [min_val, min_val + 1, max_val - 1, max_val] + self._check_nselect_boundary(vals, dtype, method) + + @pytest.mark.parametrize('dtype', ['float16', 'float32', 'float64']) + def test_boundary_float(self, method, dtype): + # GH 21426 + min_val, max_val = np.finfo(dtype).min, np.finfo(dtype).max + min_2nd, max_2nd = np.nextafter([min_val, max_val], 0, dtype=dtype) + vals = [min_val, min_2nd, max_2nd, max_val] + self._check_nselect_boundary(vals, dtype, method) + + @pytest.mark.parametrize('dtype', ['datetime64[ns]', 'timedelta64[ns]']) + def test_boundary_datetimelike(self, method, dtype): + # GH 21426 + # use int64 bounds and +1 to min_val since true minimum is NaT + # (include min_val/NaT at end to maintain same expected_idxr) + min_val, max_val = np.iinfo('int64').min, np.iinfo('int64').max + vals = [min_val + 1, min_val + 2, max_val - 1, max_val, min_val] + self._check_nselect_boundary(vals, dtype, method) + class TestCategoricalSeriesAnalytics(object): From fb9d532cfb1ddcfc2a6957b5f25b93d8a04df5b1 Mon Sep 17 00:00:00 2001 From: jschendel Date: Wed, 13 Jun 2018 17:39:29 -0600 Subject: [PATCH 4/4] review edits --- pandas/conftest.py | 71 ++++++++++++++++++++++++ pandas/tests/frame/test_analytics.py | 78 +++++++++++++-------------- pandas/tests/series/test_analytics.py | 49 ++++++++--------- 3 files changed, 130 insertions(+), 68 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index d5f399c7cd63d..9d806a91f37f7 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -129,6 +129,14 @@ def join_type(request): return request.param +@pytest.fixture(params=['nlargest', 'nsmallest']) +def nselect_method(request): + """ + Fixture for trying all nselect methods + """ + return request.param + + @pytest.fixture(params=[None, np.nan, pd.NaT, float('nan'), np.float('NaN')]) def nulls_fixture(request): """ @@ -170,3 +178,66 @@ def string_dtype(request): * 'U' """ return request.param + + +@pytest.fixture(params=["float32", "float64"]) +def float_dtype(request): + """ + Parameterized fixture for float dtypes. + + * float32 + * float64 + """ + + return request.param + + +UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"] +SIGNED_INT_DTYPES = ["int8", "int16", "int32", "int64"] +ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES + + +@pytest.fixture(params=SIGNED_INT_DTYPES) +def sint_dtype(request): + """ + Parameterized fixture for signed integer dtypes. + + * int8 + * int16 + * int32 + * int64 + """ + + return request.param + + +@pytest.fixture(params=UNSIGNED_INT_DTYPES) +def uint_dtype(request): + """ + Parameterized fixture for unsigned integer dtypes. + + * uint8 + * uint16 + * uint32 + * uint64 + """ + + return request.param + + +@pytest.fixture(params=ALL_INT_DTYPES) +def any_int_dtype(request): + """ + Parameterized fixture for any integer dtypes. + + * int8 + * uint8 + * int16 + * uint16 + * int32 + * uint32 + * int64 + * uint64 + """ + + return request.param diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index b8f1acc2aa679..6dc24ed856017 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -12,7 +12,7 @@ from numpy.random import randn import numpy as np -from pandas.compat import lrange, product, PY35 +from pandas.compat import lrange, PY35 from pandas import (compat, isna, notna, DataFrame, Series, MultiIndex, date_range, Timestamp, Categorical, _np_version_under1p12, _np_version_under1p15, @@ -2260,54 +2260,49 @@ class TestNLargestNSmallest(object): # ---------------------------------------------------------------------- # Top / bottom - @pytest.mark.parametrize( - 'method, n, order', - product(['nsmallest', 'nlargest'], range(1, 11), - [['a'], - ['c'], - ['a', 'b'], - ['a', 'c'], - ['b', 'a'], - ['b', 'c'], - ['a', 'b', 'c'], - ['c', 'a', 'b'], - ['c', 'b', 'a'], - ['b', 'c', 'a'], - ['b', 'a', 'c'], - - # dups! - ['b', 'c', 'c'], - - ])) - def test_n(self, df_strings, method, n, order): + @pytest.mark.parametrize('order', [ + ['a'], + ['c'], + ['a', 'b'], + ['a', 'c'], + ['b', 'a'], + ['b', 'c'], + ['a', 'b', 'c'], + ['c', 'a', 'b'], + ['c', 'b', 'a'], + ['b', 'c', 'a'], + ['b', 'a', 'c'], + + # dups! + ['b', 'c', 'c']]) + @pytest.mark.parametrize('n', range(1, 11)) + def test_n(self, df_strings, nselect_method, n, order): # GH10393 df = df_strings if 'b' in order: error_msg = self.dtype_error_msg_template.format( - column='b', method=method, dtype='object') + column='b', method=nselect_method, dtype='object') with tm.assert_raises_regex(TypeError, error_msg): - getattr(df, method)(n, order) + getattr(df, nselect_method)(n, order) else: - ascending = method == 'nsmallest' - result = getattr(df, method)(n, order) + ascending = nselect_method == 'nsmallest' + result = getattr(df, nselect_method)(n, order) expected = df.sort_values(order, ascending=ascending).head(n) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - 'method, columns', - product(['nsmallest', 'nlargest'], - product(['group'], ['category_string', 'string']) - )) - def test_n_error(self, df_main_dtypes, method, columns): + @pytest.mark.parametrize('columns', [ + ('group', 'category_string'), ('group', 'string')]) + def test_n_error(self, df_main_dtypes, nselect_method, columns): df = df_main_dtypes + col = columns[1] error_msg = self.dtype_error_msg_template.format( - column=columns[1], method=method, dtype=df[columns[1]].dtype) + column=col, method=nselect_method, dtype=df[col].dtype) # escape some characters that may be in the repr error_msg = (error_msg.replace('(', '\\(').replace(")", "\\)") .replace("[", "\\[").replace("]", "\\]")) with tm.assert_raises_regex(TypeError, error_msg): - getattr(df, method)(2, columns) + getattr(df, nselect_method)(2, columns) def test_n_all_dtypes(self, df_main_dtypes): df = df_main_dtypes @@ -2328,15 +2323,14 @@ def test_n_identical_values(self): expected = pd.DataFrame({'a': [1] * 3, 'b': [1, 2, 3]}) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - 'n, order', - product([1, 2, 3, 4, 5], - [['a', 'b', 'c'], - ['c', 'b', 'a'], - ['a'], - ['b'], - ['a', 'b'], - ['c', 'b']])) + @pytest.mark.parametrize('order', [ + ['a', 'b', 'c'], + ['c', 'b', 'a'], + ['a'], + ['b'], + ['a', 'b'], + ['c', 'b']]) + @pytest.mark.parametrize('n', range(1, 6)) def test_n_duplicate_index(self, df_duplicates, n, order): # GH 13412 diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 980805b340bf7..b9c7b837b8b81 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1944,11 +1944,16 @@ def test_mode_sortwarning(self): tm.assert_series_equal(result, expected) -class TestNLargestNSmallest(object): +def assert_check_nselect_boundary(vals, dtype, method): + # helper function for 'test_boundary_{dtype}' tests + s = Series(vals, dtype=dtype) + result = getattr(s, method)(3) + expected_idxr = [0, 1, 2] if method == 'nsmallest' else [3, 2, 1] + expected = s.loc[expected_idxr] + tm.assert_series_equal(result, expected) + - @pytest.fixture(params=['nlargest', 'nsmallest']) - def method(self, request): - return request.param +class TestNLargestNSmallest(object): @pytest.mark.parametrize( "r", [Series([3., 2, 1, 2, '5'], dtype='object'), @@ -2032,39 +2037,31 @@ def test_n(self, n): expected = s.sort_values().head(n) assert_series_equal(result, expected) - def _check_nselect_boundary(self, vals, dtype, method): - # helper function for 'test_boundary_dtype' tests - s = Series(vals, dtype=dtype) - result = getattr(s, method)(3) - expected_idxr = [0, 1, 2] if method == 'nsmallest' else [3, 2, 1] - expected = s.loc[expected_idxr] - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('dtype', [ - 'int8', 'int16', 'int32', 'int64', - 'uint8', 'uint16', 'uint32', 'uint64']) - def test_boundary_integer(self, method, dtype): + def test_boundary_integer(self, nselect_method, any_int_dtype): # GH 21426 - min_val, max_val = np.iinfo(dtype).min, np.iinfo(dtype).max + dtype_info = np.iinfo(any_int_dtype) + min_val, max_val = dtype_info.min, dtype_info.max vals = [min_val, min_val + 1, max_val - 1, max_val] - self._check_nselect_boundary(vals, dtype, method) + assert_check_nselect_boundary(vals, any_int_dtype, nselect_method) - @pytest.mark.parametrize('dtype', ['float16', 'float32', 'float64']) - def test_boundary_float(self, method, dtype): + def test_boundary_float(self, nselect_method, float_dtype): # GH 21426 - min_val, max_val = np.finfo(dtype).min, np.finfo(dtype).max - min_2nd, max_2nd = np.nextafter([min_val, max_val], 0, dtype=dtype) + dtype_info = np.finfo(float_dtype) + min_val, max_val = dtype_info.min, dtype_info.max + min_2nd, max_2nd = np.nextafter( + [min_val, max_val], 0, dtype=float_dtype) vals = [min_val, min_2nd, max_2nd, max_val] - self._check_nselect_boundary(vals, dtype, method) + assert_check_nselect_boundary(vals, float_dtype, nselect_method) @pytest.mark.parametrize('dtype', ['datetime64[ns]', 'timedelta64[ns]']) - def test_boundary_datetimelike(self, method, dtype): + def test_boundary_datetimelike(self, nselect_method, dtype): # GH 21426 # use int64 bounds and +1 to min_val since true minimum is NaT # (include min_val/NaT at end to maintain same expected_idxr) - min_val, max_val = np.iinfo('int64').min, np.iinfo('int64').max + dtype_info = np.iinfo('int64') + min_val, max_val = dtype_info.min, dtype_info.max vals = [min_val + 1, min_val + 2, max_val - 1, max_val, min_val] - self._check_nselect_boundary(vals, dtype, method) + assert_check_nselect_boundary(vals, dtype, nselect_method) class TestCategoricalSeriesAnalytics(object):