From 1afdbb8a27f0643a9c679b17f36fb8663d12ff3c Mon Sep 17 00:00:00 2001 From: Uwe Date: Wed, 21 Dec 2016 18:29:57 +0100 Subject: [PATCH 1/7] Fix GH 14922 having the int equivalent of NaT in an int64 column caused wrong sorting because this special value was considered as "missing value". --- pandas/core/algorithms.py | 3 ++- pandas/tests/frame/test_sorting.py | 20 ++++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6bcd3776867b6..706cc9b0f026b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -343,7 +343,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): table = hash_klass(size_hint or len(vals)) uniques = vec_klass() - labels = table.get_labels(vals, uniques, 0, na_sentinel, True) + check_nulls = not is_integer_dtype(values) + labels = table.get_labels(vals, uniques, 0, na_sentinel, check_nulls) labels = _ensure_platform_int(labels) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index b7a38e9e13ebd..e6b11e5008b42 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -6,11 +6,12 @@ from pandas.compat import lrange from pandas import (DataFrame, Series, MultiIndex, Timestamp, - date_range) + date_range, NaT) from pandas.util.testing import (assert_series_equal, assert_frame_equal, - assertRaisesRegexp) + assertRaisesRegexp, + is_sorted) import pandas.util.testing as tm @@ -491,3 +492,18 @@ def test_frame_column_inplace_sort_exception(self): cp = s.copy() cp.sort_values() # it works! 
+ + def test_sort_nat_values_in_int_column(self): + + # GH 14922, sorting with large float and multiple columns incorrect + int_values = (2, int(NaT)) + float_values = (2.0, -1.797693e308) + + df = DataFrame(dict(int=int_values, float=float_values), + columns=["int", "float"]) + + df_sorted = df.sort_values(["int", "float"]) + df_expected = DataFrame(dict(int=int_values[::-1], float=float_values[::-1]), + columns=["int", "float"], index=[1, 0]) + + assert_frame_equal(df_sorted, df_expected) From 03699c67a5953882e025ea6eb2607ea3053c9337 Mon Sep 17 00:00:00 2001 From: Uwe Date: Wed, 21 Dec 2016 18:29:57 +0100 Subject: [PATCH 2/7] Fix GH 14922 having the int equivalent of NaT in an int64 column caused wrong sorting because this special value was considered as "missing value". --- pandas/core/algorithms.py | 3 ++- pandas/tests/frame/test_sorting.py | 17 ++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6bcd3776867b6..706cc9b0f026b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -343,7 +343,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): table = hash_klass(size_hint or len(vals)) uniques = vec_klass() - labels = table.get_labels(vals, uniques, 0, na_sentinel, True) + check_nulls = not is_integer_dtype(values) + labels = table.get_labels(vals, uniques, 0, na_sentinel, check_nulls) labels = _ensure_platform_int(labels) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index b7a38e9e13ebd..b29e0c8bdd5ba 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -6,7 +6,7 @@ from pandas.compat import lrange from pandas import (DataFrame, Series, MultiIndex, Timestamp, - date_range) + date_range, NaT) from pandas.util.testing import (assert_series_equal, assert_frame_equal, @@ -491,3 +491,18 @@ def test_frame_column_inplace_sort_exception(self): cp = 
s.copy() cp.sort_values() # it works! + + def test_sort_nat_values_in_int_column(self): + + # GH 14922, sorting with large float and multiple columns incorrect + int_values = (2, int(NaT)) + float_values = (2.0, -1.797693e308) + + df = DataFrame(dict(int=int_values, float=float_values), + columns=["int", "float"]) + + df_sorted = df.sort_values(["int", "float"]) + df_expected = DataFrame(dict(int=int_values[::-1], float=float_values[::-1]), + columns=["int", "float"], index=[1, 0]) + + assert_frame_equal(df_sorted, df_expected) From 21e610c5b9646b031e825ce768987cad129aae6d Mon Sep 17 00:00:00 2001 From: Uwe Date: Thu, 22 Dec 2016 11:34:05 +0100 Subject: [PATCH 3/7] extended tests + minor cleanup --- pandas/tests/frame/test_sorting.py | 51 ++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index e6b11e5008b42..b3f25d141632c 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -10,8 +10,7 @@ from pandas.util.testing import (assert_series_equal, assert_frame_equal, - assertRaisesRegexp, - is_sorted) + assertRaisesRegexp) import pandas.util.testing as tm @@ -495,15 +494,53 @@ def test_frame_column_inplace_sort_exception(self): def test_sort_nat_values_in_int_column(self): - # GH 14922, sorting with large float and multiple columns incorrect + # GH 14922: "sorting with large float and multiple columns incorrect" + + # The cause was that the int64 value of NaT was treated as "na", which is + # only correct for datetime64 columns. 
+ int_values = (2, int(NaT)) float_values = (2.0, -1.797693e308) df = DataFrame(dict(int=int_values, float=float_values), columns=["int", "float"]) - df_sorted = df.sort_values(["int", "float"]) - df_expected = DataFrame(dict(int=int_values[::-1], float=float_values[::-1]), - columns=["int", "float"], index=[1, 0]) + df_reversed = DataFrame(dict(int=int_values[::-1], + float=float_values[::-1]), + columns=["int", "float"], + index=[1, 0]) + + # NaT is not a "na" for int64 columns, so na_position must not + # influence the result: + df_sorted = df.sort_values(["int", "float"], na_position="last") + assert_frame_equal(df_sorted, df_reversed) + + df_sorted = df.sort_values(["int", "float"], na_position="first") + assert_frame_equal(df_sorted, df_reversed) + + # reverse sorting order + df_sorted = df.sort_values(["int", "float"], ascending=False) + assert_frame_equal(df_sorted, df) + + # and now check if NaT is still considered as "na" for datetime64 + # columns: + df = DataFrame(dict(int=int_values, float=float_values), + columns=["int", "float"]) + + df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT], + float=float_values), columns=["datetime", "float"]) + + # check if the dtype is datetime64[ns]: + assert df["datetime"].dtypes == np.dtype("datetime64[ns]"),\ + "this test function is not reliable anymore" + + df_reversed = DataFrame(dict(datetime=[NaT, Timestamp("2016-01-01")], + float=float_values[::-1]), + columns=["datetime", "float"], + index=[1, 0]) + + df_sorted = df.sort_values(["datetime", "float"], na_position="first") + assert_frame_equal(df_sorted, df_reversed) - assert_frame_equal(df_sorted, df_expected) + df_sorted = df.sort_values(["datetime", "float"], na_position="last") + assert_frame_equal(df_sorted, df_reversed) From 04dcbe8b35ce6708b3a4f176165a973277fe8136 Mon Sep 17 00:00:00 2001 From: Uwe Date: Thu, 22 Dec 2016 13:03:50 +0100 Subject: [PATCH 4/7] further test cleanup --- pandas/tests/frame/test_sorting.py | 4 ---- 1 file changed, 4 
deletions(-) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index b3f25d141632c..a9193506e88ec 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -524,13 +524,9 @@ def test_sort_nat_values_in_int_column(self): # and now check if NaT is still considered as "na" for datetime64 # columns: - df = DataFrame(dict(int=int_values, float=float_values), - columns=["int", "float"]) - df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT], float=float_values), columns=["datetime", "float"]) - # check if the dtype is datetime64[ns]: assert df["datetime"].dtypes == np.dtype("datetime64[ns]"),\ "this test function is not reliable anymore" From 60cca5d3d823148f713566ebb09a631fcd510f1b Mon Sep 17 00:00:00 2001 From: Uwe Date: Thu, 22 Dec 2016 13:07:25 +0100 Subject: [PATCH 5/7] add fix of GH14922 to release notes for 0.20.0 --- doc/source/whatsnew/v0.20.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 83a70aa34fccf..329bc126e716f 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -246,6 +246,7 @@ Bug Fixes - Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`) - Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. 
Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`) - Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`) +- Bug in ``sort_values()`` when sorting by multiple columns where one colums is of type `int64` and contains `NaT` (:issue:`14922`) From 4f280269ac4011cf1dbf256920d38a97e1176e53 Mon Sep 17 00:00:00 2001 From: Uwe Date: Thu, 22 Dec 2016 13:21:00 +0100 Subject: [PATCH 6/7] fixed typo in whatsnew/v0.20.0.txt --- doc/source/whatsnew/v0.20.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 329bc126e716f..9fa55964de1c1 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -246,7 +246,7 @@ Bug Fixes - Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`) - Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. 
Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`) - Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`) -- Bug in ``sort_values()`` when sorting by multiple columns where one colums is of type `int64` and contains `NaT` (:issue:`14922`) +- Bug in ``sort_values()`` when sorting by multiple columns where one column is of type ``int64`` and contains ``NaT`` (:issue:`14922`) From c244438e487d8fbfb3b055fdd7a29dbc8edbdf98 Mon Sep 17 00:00:00 2001 From: Uwe Date: Thu, 22 Dec 2016 13:49:57 +0100 Subject: [PATCH 7/7] further cleanup tests --- pandas/tests/frame/test_sorting.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index a9193506e88ec..579a4bf5d54d5 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -527,9 +527,6 @@ def test_sort_nat_values_in_int_column(self): df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT], float=float_values), columns=["datetime", "float"]) - assert df["datetime"].dtypes == np.dtype("datetime64[ns]"),\ - "this test function is not reliable anymore" - df_reversed = DataFrame(dict(datetime=[NaT, Timestamp("2016-01-01")], float=float_values[::-1]), columns=["datetime", "float"],