From f324a9c02d7e09d963622ff8136d10bfb0345b41 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 12 Apr 2016 15:51:24 +0100 Subject: [PATCH] BUG, DOC, DEP: Patch and Align Categorical's Sorting API Clarifies the meaning of 'sort' in the context of Categorical to mean 'organization' rather than 'order', as it is possible to call this method (as well as 'sort_values') when the Categorical is unordered. Also patches a bug in 'Categorical.sort_values' in which 'na_position' was not being respected when 'ascending' was set to 'True'. This commit aligns the behaviour with that of Series. Finally, this commit deprecates 'sort' in favor of 'sort_values,' which is in alignment with the Series API as well. Closes gh-12785. --- doc/source/whatsnew/v0.18.1.txt | 1 + pandas/core/categorical.py | 120 +++++++++++++++++-------------- pandas/tests/test_categorical.py | 107 +++++++++++++++------------ 3 files changed, 129 insertions(+), 99 deletions(-) diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index dcda11a9fd3b2..b010fcc0f2d57 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -224,6 +224,7 @@ Deprecations ^^^^^^^^^^^^ - The method name ``Index.sym_diff()`` is deprecated and can be replaced by ``Index.symmetric_difference()`` (:issue:`12591`) +- The method name ``Categorical.sort()`` is deprecated in favor of ``Categorical.sort_values()`` (:issue:`12882`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index bf5fbb95dbfaa..986f7ad55361a 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1157,30 +1157,76 @@ def argsort(self, ascending=True, **kwargs): return result def sort_values(self, inplace=False, ascending=True, na_position='last'): - """ Sorts the Category by category value returning a new Categorical by - default. + """ Sorts the Categorical by category value returning a new + Categorical by default. - Only ordered Categoricals can be sorted! - - Categorical.sort is the equivalent but sorts the Categorical inplace. + While an ordering is applied to the category values, sorting in this + context refers more to organizing and grouping together based on + matching category values. Thus, this function can be called on an + unordered Categorical instance unlike the functions 'Categorical.min' + and 'Categorical.max'. Parameters ---------- inplace : boolean, default False Do operation in place. ascending : boolean, default True - Sort ascending. Passing False sorts descending + Order ascending. Passing False orders descending. The + ordering parameter provides the method by which the + category values are organized. na_position : {'first', 'last'} (optional, default='last') 'first' puts NaNs at the beginning 'last' puts NaNs at the end Returns ------- - y : Category or None + y : Categorical or None See Also -------- - Category.sort + Categorical.sort + + Examples + -------- + >>> c = pd.Categorical([1, 2, 2, 1, 5]) + >>> c + [1, 2, 2, 1, 5] + Categories (3, int64): [1, 2, 5] + >>> c.sort_values() + [1, 1, 2, 2, 5] + Categories (3, int64): [1, 2, 5] + >>> c.sort_values(ascending=False) + [5, 2, 2, 1, 1] + Categories (3, int64): [1, 2, 5] + + Inplace sorting can be done as well: + + >>> c.sort_values(inplace=True) + >>> c + [1, 1, 2, 2, 5] + Categories (3, int64): [1, 2, 5] + >>> + >>> c = pd.Categorical([1, 2, 2, 1, 5]) + + 'sort_values' behaviour with NaNs. Note that 'na_position' + is independent of the 'ascending' parameter: + + >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5]) + >>> c + [NaN, 2.0, 2.0, NaN, 5.0] + Categories (2, int64): [2, 5] + >>> c.sort_values() + [2.0, 2.0, 5.0, NaN, NaN] + Categories (2, int64): [2, 5] + >>> c.sort_values(ascending=False) + [5.0, 2.0, 2.0, NaN, NaN] + Categories (2, int64): [2, 5] + >>> c.sort_values(na_position='first') + [NaN, NaN, 2.0, 2.0, 5.0] + Categories (2, int64): [2, 5] + >>> c.sort_values(ascending=False, na_position='first') + [NaN, NaN, 5.0, 2.0, 2.0] + Categories (2, int64): [2, 5] """ if na_position not in ['last', 'first']: raise ValueError('invalid na_position: {!r}'.format(na_position)) @@ -1193,13 +1239,13 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'): na_mask = (codes == -1) if na_mask.any(): n_nans = len(codes[na_mask]) - if na_position == "first" and not ascending: + if na_position == "first": # in this case sort to the front new_codes = codes.copy() new_codes[0:n_nans] = -1 new_codes[n_nans:] = codes[~na_mask] codes = new_codes - elif na_position == "last" and not ascending: + elif na_position == "last": # ... and to the end new_codes = codes.copy() pos = len(codes) - n_nans @@ -1215,32 +1261,12 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'): def order(self, inplace=False, ascending=True, na_position='last'): """ - DEPRECATED: use :meth:`Categorical.sort_values` - - Sorts the Category by category value returning a new Categorical by - default. - - Only ordered Categoricals can be sorted! - - Categorical.sort is the equivalent but sorts the Categorical inplace. - - Parameters - ---------- - inplace : boolean, default False - Do operation in place. - ascending : boolean, default True - Sort ascending. Passing False sorts descending - na_position : {'first', 'last'} (optional, default='last') - 'first' puts NaNs at the beginning - 'last' puts NaNs at the end - - Returns - ------- - y : Category or None + DEPRECATED: use :meth:`Categorical.sort_values`. That function + is entirely equivalent to this one. See Also -------- - Category.sort + Categorical.sort_values """ warn("order is deprecated, use sort_values(...)", FutureWarning, stacklevel=2) @@ -1248,30 +1274,18 @@ def order(self, inplace=False, ascending=True, na_position='last'): na_position=na_position) def sort(self, inplace=True, ascending=True, na_position='last'): - """ Sorts the Category inplace by category value. - - Only ordered Categoricals can be sorted! - - Catgorical.order is the equivalent but returns a new Categorical. - - Parameters - ---------- - ascending : boolean, default True - Sort ascending. Passing False sorts descending - inplace : boolean, default False - Do operation in place. - na_position : {'first', 'last'} (optional, default='last') - 'first' puts NaNs at the beginning - 'last' puts NaNs at the end - - Returns - ------- - y : Category or None + """ + DEPRECATED: use :meth:`Categorical.sort_values`. That function + is just like this one, except that a new Categorical is returned + by default, so make sure to pass in 'inplace=True' to get + inplace sorting. See Also -------- - Category.sort_values + Categorical.sort_values """ + warn("sort is deprecated, use sort_values(...)", FutureWarning, + stacklevel=2) return self.sort_values(inplace=inplace, ascending=ascending, na_position=na_position) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 607e6ae04148e..a0e6241383289 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1277,12 +1277,11 @@ def test_mode(self): exp = Categorical([4], categories=[5, 4, 3, 2, 1], ordered=True) self.assertTrue(res.equals(exp)) - def test_sort(self): + def test_sort_values(self): # unordered cats are sortable cat = Categorical(["a", "b", "b", "a"], ordered=False) cat.sort_values() - cat.sort() cat = Categorical(["a", "c", "b", "d"], ordered=True) @@ -1303,10 +1302,62 @@ def test_sort(self): # sort (inplace order) cat1 = cat.copy() - cat1.sort() + cat1.sort_values(inplace=True) exp = np.array(["a", "b", "c", "d"], dtype=object) self.assert_numpy_array_equal(cat1.__array__(), exp) + # reverse + cat = Categorical(["a", "c", "c", "b", "d"], ordered=True) + res = cat.sort_values(ascending=False) + exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object) + exp_categories = np.array(["a", "b", "c", "d"], dtype=object) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.categories, exp_categories) + + def test_sort_values_na_position(self): + # see gh-12882 + cat = Categorical([5, 2, np.nan, 2, np.nan], ordered=True) + exp_categories = np.array([2, 5]) + + exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan]) + res = cat.sort_values() # default arguments + self.assert_numpy_array_equal(res.__array__(), exp) + self.assert_numpy_array_equal(res.categories, exp_categories) + + exp = np.array([np.nan, np.nan, 2.0, 2.0, 5.0]) + res = cat.sort_values(ascending=True, na_position='first') + self.assert_numpy_array_equal(res.__array__(), exp) + self.assert_numpy_array_equal(res.categories, exp_categories) + + exp = np.array([np.nan, np.nan, 5.0, 2.0, 2.0]) + res = cat.sort_values(ascending=False, na_position='first') + self.assert_numpy_array_equal(res.__array__(), exp) + self.assert_numpy_array_equal(res.categories, exp_categories) + + exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan]) + res = cat.sort_values(ascending=True, na_position='last') + self.assert_numpy_array_equal(res.__array__(), exp) + self.assert_numpy_array_equal(res.categories, exp_categories) + + exp = np.array([5.0, 2.0, 2.0, np.nan, np.nan]) + res = cat.sort_values(ascending=False, na_position='last') + self.assert_numpy_array_equal(res.__array__(), exp) + self.assert_numpy_array_equal(res.categories, exp_categories) + + cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) + res = cat.sort_values(ascending=False, na_position='last') + exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object) + exp_categories = np.array(["a", "b", "c", "d"], dtype=object) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.categories, exp_categories) + + cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) + res = cat.sort_values(ascending=False, na_position='first') + exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object) + exp_categories = np.array(["a", "b", "c", "d"], dtype=object) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.categories, exp_categories) + def test_slicing_directly(self): cat = Categorical(["a", "b", "c", "d", "a", "b", "c"]) sliced = cat[3] @@ -2951,14 +3002,16 @@ def test_count(self): result = s.count() self.assertEqual(result, 2) - def test_sort(self): + def test_sort_values(self): c = Categorical(["a", "b", "b", "a"], ordered=False) - cat = Series(c) + cat = Series(c.copy()) - # 9816 deprecated - with tm.assert_produces_warning(FutureWarning): - c.order() + # 'order' was deprecated in gh-10726 + # 'sort' was deprecated in gh-12882 + for func in ('order', 'sort'): + with tm.assert_produces_warning(FutureWarning): + getattr(c, func)() # sort in the categories order expected = Series( @@ -3024,44 +3077,6 @@ def test_sort(self): expected = df.iloc[[2, 1, 5, 4, 3, 0]] tm.assert_frame_equal(result, expected) - # reverse - cat = Categorical(["a", "c", "c", "b", "d"], ordered=True) - res = cat.sort_values(ascending=False) - exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object) - exp_categories = np.array(["a", "b", "c", "d"], dtype=object) - self.assert_numpy_array_equal(res.__array__(), exp_val) - self.assert_numpy_array_equal(res.categories, exp_categories) - - # some NaN positions - - cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) - res = cat.sort_values(ascending=False, na_position='last') - exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object) - exp_categories = np.array(["a", "b", "c", "d"], dtype=object) - self.assert_numpy_array_equal(res.__array__(), exp_val) - self.assert_numpy_array_equal(res.categories, exp_categories) - - cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) - res = cat.sort_values(ascending=False, na_position='first') - exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object) - exp_categories = np.array(["a", "b", "c", "d"], dtype=object) - self.assert_numpy_array_equal(res.__array__(), exp_val) - self.assert_numpy_array_equal(res.categories, exp_categories) - - cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) - res = cat.sort_values(ascending=False, na_position='first') - exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object) - exp_categories = np.array(["a", "b", "c", "d"], dtype=object) - self.assert_numpy_array_equal(res.__array__(), exp_val) - self.assert_numpy_array_equal(res.categories, exp_categories) - - cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) - res = cat.sort_values(ascending=False, na_position='last') - exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object) - exp_categories = np.array(["a", "b", "c", "d"], dtype=object) - self.assert_numpy_array_equal(res.__array__(), exp_val) - self.assert_numpy_array_equal(res.categories, exp_categories) - def test_slicing(self): cat = Series(Categorical([1, 2, 3, 4])) reversed = cat[::-1]