From 1afdbb8a27f0643a9c679b17f36fb8663d12ff3c Mon Sep 17 00:00:00 2001 From: Uwe Date: Wed, 21 Dec 2016 18:29:57 +0100 Subject: [PATCH] Fix GH 14922: the int equivalent of NaT in an int64 column caused wrong sorting because this special value was treated as a "missing value". --- pandas/core/algorithms.py | 3 ++- pandas/tests/frame/test_sorting.py | 20 ++++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6bcd3776867b6..706cc9b0f026b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -343,7 +343,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): table = hash_klass(size_hint or len(vals)) uniques = vec_klass() - labels = table.get_labels(vals, uniques, 0, na_sentinel, True) + check_nulls = not is_integer_dtype(values) + labels = table.get_labels(vals, uniques, 0, na_sentinel, check_nulls) labels = _ensure_platform_int(labels) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index b7a38e9e13ebd..e6b11e5008b42 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -6,11 +6,12 @@ from pandas.compat import lrange from pandas import (DataFrame, Series, MultiIndex, Timestamp, - date_range) + date_range, NaT) from pandas.util.testing import (assert_series_equal, assert_frame_equal, - assertRaisesRegexp) + assertRaisesRegexp, + is_sorted) import pandas.util.testing as tm @@ -491,3 +492,18 @@ def test_frame_column_inplace_sort_exception(self): cp = s.copy() cp.sort_values() # it works! 
+
+    def test_sort_nat_values_in_int_column(self):
+        # GH 14922: int(NaT) stored in an int column is a regular value, not
+        # a missing one, so it must be sorted like any other integer.
+        int_values = (2, int(NaT))
+        float_values = (2.0, -1.797693e308)
+
+        df = DataFrame(dict(int=int_values, float=float_values),
+                       columns=["int", "float"])
+        df_sorted = df.sort_values(["int", "float"])
+
+        df_expected = DataFrame(dict(int=int_values[::-1],
+                                     float=float_values[::-1]),
+                                columns=["int", "float"], index=[1, 0])
+        assert_frame_equal(df_sorted, df_expected)