diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index dcda11a9fd3b2..b010fcc0f2d57 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -224,6 +224,7 @@ Deprecations ^^^^^^^^^^^^ - The method name ``Index.sym_diff()`` is deprecated and can be replaced by ``Index.symmetric_difference()`` (:issue:`12591`) +- The method name ``Categorical.sort()`` is deprecated in favor of ``Categorical.sort_values()``; pass ``inplace=True`` to keep the in-place behavior, since ``sort_values()`` returns a new ``Categorical`` by default (:issue:`12882`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index bf5fbb95dbfaa..986f7ad55361a 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1157,30 +1157,76 @@ def argsort(self, ascending=True, **kwargs): return result def sort_values(self, inplace=False, ascending=True, na_position='last'): - """ Sorts the Category by category value returning a new Categorical by - default. + """ Sorts the Categorical by category value returning a new + Categorical by default. - Only ordered Categoricals can be sorted! - - Categorical.sort is the equivalent but sorts the Categorical inplace. + While an ordering is applied to the category values, sorting in this + context refers more to organizing and grouping together based on + matching category values. Thus, this function can be called on an + unordered Categorical instance unlike the functions 'Categorical.min' + and 'Categorical.max'. Parameters ---------- inplace : boolean, default False Do operation in place. ascending : boolean, default True - Sort ascending. Passing False sorts descending + Order ascending. Passing False orders descending. The + ordering parameter provides the method by which the + category values are organized.
na_position : {'first', 'last'} (optional, default='last') 'first' puts NaNs at the beginning 'last' puts NaNs at the end Returns ------- - y : Category or None + y : Categorical or None See Also -------- - Category.sort + Categorical.sort + + Examples + -------- + >>> c = pd.Categorical([1, 2, 2, 1, 5]) + >>> c + [1, 2, 2, 1, 5] + Categories (3, int64): [1, 2, 5] + >>> c.sort_values() + [1, 1, 2, 2, 5] + Categories (3, int64): [1, 2, 5] + >>> c.sort_values(ascending=False) + [5, 2, 2, 1, 1] + Categories (3, int64): [1, 2, 5] + + Inplace sorting can be done as well: + + >>> c.sort_values(inplace=True) + >>> c + [1, 1, 2, 2, 5] + Categories (3, int64): [1, 2, 5] + >>> + >>> c = pd.Categorical([1, 2, 2, 1, 5]) + + 'sort_values' behaviour with NaNs. Note that 'na_position' + is independent of the 'ascending' parameter: + + >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5]) + >>> c + [NaN, 2.0, 2.0, NaN, 5.0] + Categories (2, int64): [2, 5] + >>> c.sort_values() + [2.0, 2.0, 5.0, NaN, NaN] + Categories (2, int64): [2, 5] + >>> c.sort_values(ascending=False) + [5.0, 2.0, 2.0, NaN, NaN] + Categories (2, int64): [2, 5] + >>> c.sort_values(na_position='first') + [NaN, NaN, 2.0, 2.0, 5.0] + Categories (2, int64): [2, 5] + >>> c.sort_values(ascending=False, na_position='first') + [NaN, NaN, 5.0, 2.0, 2.0] + Categories (2, int64): [2, 5] """ if na_position not in ['last', 'first']: raise ValueError('invalid na_position: {!r}'.format(na_position)) @@ -1193,13 +1239,13 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'): na_mask = (codes == -1) if na_mask.any(): n_nans = len(codes[na_mask]) - if na_position == "first" and not ascending: + if na_position == "first": # in this case sort to the front new_codes = codes.copy() new_codes[0:n_nans] = -1 new_codes[n_nans:] = codes[~na_mask] codes = new_codes - elif na_position == "last" and not ascending: + elif na_position == "last": # ... 
and to the end new_codes = codes.copy() pos = len(codes) - n_nans @@ -1215,32 +1261,12 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'): def order(self, inplace=False, ascending=True, na_position='last'): """ - DEPRECATED: use :meth:`Categorical.sort_values` - - Sorts the Category by category value returning a new Categorical by - default. - - Only ordered Categoricals can be sorted! - - Categorical.sort is the equivalent but sorts the Categorical inplace. - - Parameters - ---------- - inplace : boolean, default False - Do operation in place. - ascending : boolean, default True - Sort ascending. Passing False sorts descending - na_position : {'first', 'last'} (optional, default='last') - 'first' puts NaNs at the beginning - 'last' puts NaNs at the end - - Returns - ------- - y : Category or None + DEPRECATED: use :meth:`Categorical.sort_values`. That function + is entirely equivalent to this one. See Also -------- - Category.sort + Categorical.sort_values """ warn("order is deprecated, use sort_values(...)", FutureWarning, stacklevel=2) @@ -1248,30 +1274,18 @@ def order(self, inplace=False, ascending=True, na_position='last'): na_position=na_position) def sort(self, inplace=True, ascending=True, na_position='last'): - """ Sorts the Category inplace by category value. - - Only ordered Categoricals can be sorted! - - Catgorical.order is the equivalent but returns a new Categorical. - - Parameters - ---------- - ascending : boolean, default True - Sort ascending. Passing False sorts descending - inplace : boolean, default False - Do operation in place. - na_position : {'first', 'last'} (optional, default='last') - 'first' puts NaNs at the beginning - 'last' puts NaNs at the end - - Returns - ------- - y : Category or None + """ + DEPRECATED: use :meth:`Categorical.sort_values`. That function + is just like this one, except that a new Categorical is returned + by default, so make sure to pass in 'inplace=True' to get + inplace sorting. 
See Also -------- - Category.sort_values + Categorical.sort_values """ + warn("sort is deprecated, use sort_values(...)", FutureWarning, + stacklevel=2) return self.sort_values(inplace=inplace, ascending=ascending, na_position=na_position) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 607e6ae04148e..a0e6241383289 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1277,12 +1277,11 @@ def test_mode(self): exp = Categorical([4], categories=[5, 4, 3, 2, 1], ordered=True) self.assertTrue(res.equals(exp)) - def test_sort(self): + def test_sort_values(self): # unordered cats are sortable cat = Categorical(["a", "b", "b", "a"], ordered=False) cat.sort_values() - cat.sort() cat = Categorical(["a", "c", "b", "d"], ordered=True) @@ -1303,10 +1302,62 @@ def test_sort(self): # sort (inplace order) cat1 = cat.copy() - cat1.sort() + cat1.sort_values(inplace=True) exp = np.array(["a", "b", "c", "d"], dtype=object) self.assert_numpy_array_equal(cat1.__array__(), exp) + # reverse + cat = Categorical(["a", "c", "c", "b", "d"], ordered=True) + res = cat.sort_values(ascending=False) + exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object) + exp_categories = np.array(["a", "b", "c", "d"], dtype=object) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.categories, exp_categories) + + def test_sort_values_na_position(self): + # see gh-12882 + cat = Categorical([5, 2, np.nan, 2, np.nan], ordered=True) + exp_categories = np.array([2, 5]) + + exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan]) + res = cat.sort_values() # default arguments + self.assert_numpy_array_equal(res.__array__(), exp) + self.assert_numpy_array_equal(res.categories, exp_categories) + + exp = np.array([np.nan, np.nan, 2.0, 2.0, 5.0]) + res = cat.sort_values(ascending=True, na_position='first') + self.assert_numpy_array_equal(res.__array__(), exp) + self.assert_numpy_array_equal(res.categories, 
exp_categories) + + exp = np.array([np.nan, np.nan, 5.0, 2.0, 2.0]) + res = cat.sort_values(ascending=False, na_position='first') + self.assert_numpy_array_equal(res.__array__(), exp) + self.assert_numpy_array_equal(res.categories, exp_categories) + + exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan]) + res = cat.sort_values(ascending=True, na_position='last') + self.assert_numpy_array_equal(res.__array__(), exp) + self.assert_numpy_array_equal(res.categories, exp_categories) + + exp = np.array([5.0, 2.0, 2.0, np.nan, np.nan]) + res = cat.sort_values(ascending=False, na_position='last') + self.assert_numpy_array_equal(res.__array__(), exp) + self.assert_numpy_array_equal(res.categories, exp_categories) + + cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) + res = cat.sort_values(ascending=False, na_position='last') + exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object) + exp_categories = np.array(["a", "b", "c", "d"], dtype=object) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.categories, exp_categories) + + cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) + res = cat.sort_values(ascending=False, na_position='first') + exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object) + exp_categories = np.array(["a", "b", "c", "d"], dtype=object) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.categories, exp_categories) + def test_slicing_directly(self): cat = Categorical(["a", "b", "c", "d", "a", "b", "c"]) sliced = cat[3] @@ -2951,14 +3002,16 @@ def test_count(self): result = s.count() self.assertEqual(result, 2) - def test_sort(self): + def test_sort_values(self): c = Categorical(["a", "b", "b", "a"], ordered=False) - cat = Series(c) + cat = Series(c.copy()) - # 9816 deprecated - with tm.assert_produces_warning(FutureWarning): - c.order() + # 'order' was deprecated in gh-10726 + # 'sort' was deprecated in gh-12882 + for func in ('order', 
'sort'): + with tm.assert_produces_warning(FutureWarning): + getattr(c, func)() # sort in the categories order expected = Series( @@ -3024,44 +3077,6 @@ def test_sort(self): expected = df.iloc[[2, 1, 5, 4, 3, 0]] tm.assert_frame_equal(result, expected) - # reverse - cat = Categorical(["a", "c", "c", "b", "d"], ordered=True) - res = cat.sort_values(ascending=False) - exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object) - exp_categories = np.array(["a", "b", "c", "d"], dtype=object) - self.assert_numpy_array_equal(res.__array__(), exp_val) - self.assert_numpy_array_equal(res.categories, exp_categories) - - # some NaN positions - - cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) - res = cat.sort_values(ascending=False, na_position='last') - exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object) - exp_categories = np.array(["a", "b", "c", "d"], dtype=object) - self.assert_numpy_array_equal(res.__array__(), exp_val) - self.assert_numpy_array_equal(res.categories, exp_categories) - - cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) - res = cat.sort_values(ascending=False, na_position='first') - exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object) - exp_categories = np.array(["a", "b", "c", "d"], dtype=object) - self.assert_numpy_array_equal(res.__array__(), exp_val) - self.assert_numpy_array_equal(res.categories, exp_categories) - - cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) - res = cat.sort_values(ascending=False, na_position='first') - exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object) - exp_categories = np.array(["a", "b", "c", "d"], dtype=object) - self.assert_numpy_array_equal(res.__array__(), exp_val) - self.assert_numpy_array_equal(res.categories, exp_categories) - - cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) - res = cat.sort_values(ascending=False, na_position='last') - exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object) - exp_categories = 
np.array(["a", "b", "c", "d"], dtype=object) - self.assert_numpy_array_equal(res.__array__(), exp_val) - self.assert_numpy_array_equal(res.categories, exp_categories) - def test_slicing(self): cat = Series(Categorical([1, 2, 3, 4])) reversed = cat[::-1]