Skip to content

Commit

Permalink
Backport PR #46656: BUG: df.nsmallest get wrong results when NaN in t…
Browse files Browse the repository at this point in the history
…he sorting column (#46748)

Co-authored-by: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com>
  • Loading branch information
simonjayhawkins and GYHHAHA authored Apr 12, 2022
1 parent dfbc1dc commit 2886388
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 1 deletion.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ including other versions of pandas.

Fixed regressions
~~~~~~~~~~~~~~~~~
- Fixed regression in :meth:`DataFrame.nsmallest` that led to wrong results when ``np.nan`` was in the sorting column (:issue:`46589`)
- Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`)
-

Expand Down
6 changes: 5 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1216,7 +1216,6 @@ def compute(self, method: str) -> Series:
arr = arr[::-1]

nbase = n
findex = len(self.obj)
narr = len(arr)
n = min(n, narr)

Expand All @@ -1229,6 +1228,11 @@ def compute(self, method: str) -> Series:
if self.keep != "all":
inds = inds[:n]
findex = nbase
else:
if len(inds) < nbase and len(nan_index) + len(inds) >= nbase:
findex = len(nan_index) + len(inds)
else:
findex = len(inds)

if self.keep == "last":
# reverse indices
Expand Down
21 changes: 21 additions & 0 deletions pandas/tests/frame/methods/test_nlargest.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,3 +216,24 @@ def test_nlargest_nan(self):
result = df.nlargest(5, 0)
expected = df.sort_values(0, ascending=False).head(5)
tm.assert_frame_equal(result, expected)

def test_nsmallest_nan_after_n_element(self):
# GH#46589
df = pd.DataFrame(
{
"a": [1, 2, 3, 4, 5, None, 7],
"b": [7, 6, 5, 4, 3, 2, 1],
"c": [1, 1, 2, 2, 3, 3, 3],
},
index=range(7),
)
result = df.nsmallest(5, columns=["a", "b"])
expected = pd.DataFrame(
{
"a": [1, 2, 3, 4, 5],
"b": [7, 6, 5, 4, 3],
"c": [1, 1, 2, 2, 3],
},
index=range(5),
).astype({"a": "float"})
tm.assert_frame_equal(result, expected)
12 changes: 12 additions & 0 deletions pandas/tests/series/methods/test_nlargest.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,3 +231,15 @@ def test_nlargest_nullable(self, any_numeric_ea_dtype):
.astype(dtype)
)
tm.assert_series_equal(result, expected)

def test_nsmallest_nan_when_keep_is_all(self):
# GH#46589
s = Series([1, 2, 3, 3, 3, None])
result = s.nsmallest(3, keep="all")
expected = Series([1.0, 2.0, 3.0, 3.0, 3.0])
tm.assert_series_equal(result, expected)

s = Series([1, 2, None, None, None])
result = s.nsmallest(3, keep="all")
expected = Series([1, 2, None, None, None])
tm.assert_series_equal(result, expected)

0 comments on commit 2886388

Please sign in to comment.