diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 83a70aa34fccf..9fa55964de1c1 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -246,6 +246,7 @@ Bug Fixes - Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`) - Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`) - Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`) +- Bug in ``sort_values()`` when sorting by multiple columns where one column is of type ``int64`` and contains ``NaT`` (:issue:`14922`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6bcd3776867b6..706cc9b0f026b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -343,7 +343,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): table = hash_klass(size_hint or len(vals)) uniques = vec_klass() - labels = table.get_labels(vals, uniques, 0, na_sentinel, True) + check_nulls = not is_integer_dtype(values) + labels = table.get_labels(vals, uniques, 0, na_sentinel, check_nulls) labels = _ensure_platform_int(labels) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index b7a38e9e13ebd..579a4bf5d54d5 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -6,7 +6,7 @@ from pandas.compat import lrange from pandas import (DataFrame, Series, MultiIndex, Timestamp, - date_range) + date_range, NaT) from pandas.util.testing import (assert_series_equal, assert_frame_equal, @@ -491,3 +491,49 @@ def test_frame_column_inplace_sort_exception(self): cp = s.copy() cp.sort_values() # it works! + + def test_sort_nat_values_in_int_column(self): + + # GH 14922: "sorting with large float and multiple columns incorrect" + + # cause was that the int64 value NaT was considered as "na". Which is + # only correct for datetime64 columns. + + int_values = (2, int(NaT)) + float_values = (2.0, -1.797693e308) + + df = DataFrame(dict(int=int_values, float=float_values), + columns=["int", "float"]) + + df_reversed = DataFrame(dict(int=int_values[::-1], + float=float_values[::-1]), + columns=["int", "float"], + index=[1, 0]) + + # NaT is not a "na" for int64 columns, so na_position must not + # influence the result: + df_sorted = df.sort_values(["int", "float"], na_position="last") + assert_frame_equal(df_sorted, df_reversed) + + df_sorted = df.sort_values(["int", "float"], na_position="first") + assert_frame_equal(df_sorted, df_reversed) + + # reverse sorting order + df_sorted = df.sort_values(["int", "float"], ascending=False) + assert_frame_equal(df_sorted, df) + + # and now check if NaT is still considered as "na" for datetime64 + # columns: + df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT], + float=float_values), columns=["datetime", "float"]) + + df_reversed = DataFrame(dict(datetime=[NaT, Timestamp("2016-01-01")], + float=float_values[::-1]), + columns=["datetime", "float"], + index=[1, 0]) + + df_sorted = df.sort_values(["datetime", "float"], na_position="first") + assert_frame_equal(df_sorted, df_reversed) + + df_sorted = df.sort_values(["datetime", "float"], na_position="last") + assert_frame_equal(df_sorted, df_reversed)