From 80280ec576ab8077ba0cc6664c6a358f0b1e671e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 27 Mar 2017 10:45:17 -0400 Subject: [PATCH] DEPR: Drop support for NaN categories in Categorical Deprecated in 0.17.0. xref #10748 xref #13648 Author: Jeff Reback Author: gfyoung Closes #15806 from gfyoung/categories-nan-drop and squashes the following commits: 318175b [Jeff Reback] TST: test pd.NaT with correct dtype 4dce349 [gfyoung] Drop support for NaN categories in Categorical --- doc/source/categorical.rst | 9 ++ doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/categorical.py | 13 +- pandas/tests/indexes/test_category.py | 14 -- pandas/tests/test_categorical.py | 207 ++++---------------------- 5 files changed, 41 insertions(+), 203 deletions(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 2203737ecd7b5..411f973e9a71f 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -230,6 +230,15 @@ Categories must be unique or a `ValueError` is raised: except ValueError as e: print("ValueError: " + str(e)) +Categories must also not be ``NaN`` or a `ValueError` is raised: + +.. ipython:: python + + try: + s.cat.categories = [1,2,np.nan] + except ValueError as e: + print("ValueError: " + str(e)) + Appending new categories ~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 358d66653fb9c..a0b2b47c4bac3 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -816,6 +816,7 @@ Removal of prior version deprecations/changes in favor of ``iloc`` and ``iat`` as explained :ref:`here ` (:issue:`10711`). - The deprecated ``DataFrame.iterkv()`` has been removed in favor of ``DataFrame.iteritems()`` (:issue:`10711`) - The ``Categorical`` constructor has dropped the ``name`` parameter (:issue:`10632`) +- ``Categorical`` has dropped support for ``NaN`` categories (:issue:`10748`) - The ``take_last`` parameter has been dropped from ``duplicated()``, ``drop_duplicates()``, ``nlargest()``, and ``nsmallest()`` methods (:issue:`10236`, :issue:`10792`, :issue:`10920`) - ``Series``, ``Index``, and ``DataFrame`` have dropped the ``sort`` and ``order`` methods (:issue:`10726`) - Where clauses in ``pytables`` are only accepted as strings and expressions types and not other data-types (:issue:`12027`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 0e58c18631588..632c24c33feb7 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -545,18 +545,11 @@ def _validate_categories(cls, categories, fastpath=False): if not fastpath: - # check properties of the categories - # we don't allow NaNs in the categories themselves - + # Categories cannot contain NaN. if categories.hasnans: - # NaNs in cats deprecated in 0.17 - # GH 10748 - msg = ('\nSetting NaNs in `categories` is deprecated and ' - 'will be removed in a future version of pandas.') - warn(msg, FutureWarning, stacklevel=3) - - # categories must be unique + raise ValueError('Categorial categories cannot be null') + # Categories must be unique. if not categories.is_unique: raise ValueError('Categorical categories must be unique') diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 64a0e71bd5ace..ef1be7e60e0e8 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -183,11 +183,6 @@ def test_contains(self): self.assertFalse(0 in ci) self.assertFalse(1 in ci) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - ci = CategoricalIndex( - list('aabbca'), categories=list('cabdef') + [np.nan]) - self.assertFalse(np.nan in ci) - ci = CategoricalIndex( list('aabbca') + [np.nan], categories=list('cabdef')) self.assertTrue(np.nan in ci) @@ -541,7 +536,6 @@ def test_ensure_copied_data(self): self.assertIs(_base(index.values), _base(result.values)) def test_equals_categorical(self): - ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True) ci2 = CategoricalIndex(['a', 'b'], categories=['a', 'b', 'c'], ordered=True) @@ -579,14 +573,6 @@ def test_equals_categorical(self): self.assertFalse(ci.equals(CategoricalIndex(list('aabca')))) self.assertTrue(ci.equals(ci.copy())) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - ci = CategoricalIndex(list('aabca'), - categories=['c', 'a', 'b', np.nan]) - self.assertFalse(ci.equals(list('aabca'))) - self.assertFalse(ci.equals(CategoricalIndex(list('aabca')))) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.assertTrue(ci.equals(ci.copy())) - ci = CategoricalIndex(list('aabca') + [np.nan], categories=['c', 'a', 'b']) self.assertFalse(ci.equals(list('aabca'))) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 479f0e4566b8d..8fd3c6324d48c 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # pylint: disable=E1101,E1103,W0232 +import pytest import sys from datetime import datetime from distutils.version import LooseVersion @@ -17,7 +18,8 @@ import pandas.compat as compat import pandas.util.testing as tm from pandas import (Categorical, Index, Series, DataFrame, PeriodIndex, - Timestamp, CategoricalIndex, isnull) + Timestamp, CategoricalIndex, DatetimeIndex, + isnull, NaT) from pandas.compat import range, lrange, u, PY3 from pandas.core.config import option_context @@ -160,12 +162,6 @@ def f(): self.assertRaises(ValueError, f) - def f(): - with tm.assert_produces_warning(FutureWarning): - Categorical([1, 2], [1, 2, np.nan, np.nan]) - - self.assertRaises(ValueError, f) - # The default should be unordered c1 = Categorical(["a", "b", "c", "a"]) self.assertFalse(c1.ordered) @@ -222,29 +218,12 @@ def f(): cat = pd.Categorical([np.nan, 1., 2., 3.]) self.assertTrue(is_float_dtype(cat.categories)) - # Deprecating NaNs in categoires (GH #10748) - # preserve int as far as possible by converting to object if NaN is in - # categories - with tm.assert_produces_warning(FutureWarning): - cat = pd.Categorical([np.nan, 1, 2, 3], - categories=[np.nan, 1, 2, 3]) - self.assertTrue(is_object_dtype(cat.categories)) - # This doesn't work -> this would probably need some kind of "remember # the original type" feature to try to cast the array interface result # to... # vals = np.asarray(cat[cat.notnull()]) # self.assertTrue(is_integer_dtype(vals)) - with tm.assert_produces_warning(FutureWarning): - cat = pd.Categorical([np.nan, "a", "b", "c"], - categories=[np.nan, "a", "b", "c"]) - self.assertTrue(is_object_dtype(cat.categories)) - # but don't do it for floats - with tm.assert_produces_warning(FutureWarning): - cat = pd.Categorical([np.nan, 1., 2., 3.], - categories=[np.nan, 1., 2., 3.]) - self.assertTrue(is_float_dtype(cat.categories)) # corner cases cat = pd.Categorical([1]) @@ -295,6 +274,22 @@ def f(): c = Categorical(np.array([], dtype='int64'), # noqa categories=[3, 2, 1], ordered=True) + def test_constructor_with_null(self): + + # Cannot have NaN in categories + with pytest.raises(ValueError): + pd.Categorical([np.nan, "a", "b", "c"], + categories=[np.nan, "a", "b", "c"]) + + with pytest.raises(ValueError): + pd.Categorical([None, "a", "b", "c"], + categories=[None, "a", "b", "c"]) + + with pytest.raises(ValueError): + pd.Categorical(DatetimeIndex(['nat', '20160101']), + categories=[NaT, Timestamp('20160101')]) + + def test_constructor_with_index(self): ci = CategoricalIndex(list('aabbca'), categories=list('cab')) tm.assert_categorical_equal(ci.values, Categorical(ci)) @@ -418,6 +413,12 @@ def f(): self.assertRaises(ValueError, f) + # NaN categories included + def f(): + Categorical.from_codes([0, 1, 2], ["a", "b", np.nan]) + + self.assertRaises(ValueError, f) + # too negative def f(): Categorical.from_codes([-2, 1, 2], ["a", "b", "c"]) @@ -649,30 +650,6 @@ def test_describe(self): name='categories')) tm.assert_frame_equal(desc, expected) - # NA as a category - with tm.assert_produces_warning(FutureWarning): - cat = pd.Categorical(["a", "c", "c", np.nan], - categories=["b", "a", "c", np.nan]) - result = cat.describe() - - expected = DataFrame([[0, 0], [1, 0.25], [2, 0.5], [1, 0.25]], - columns=['counts', 'freqs'], - index=pd.CategoricalIndex(['b', 'a', 'c', np.nan], - name='categories')) - tm.assert_frame_equal(result, expected, check_categorical=False) - - # NA as an unused category - with tm.assert_produces_warning(FutureWarning): - cat = pd.Categorical(["a", "c", "c"], - categories=["b", "a", "c", np.nan]) - result = cat.describe() - - exp_idx = pd.CategoricalIndex( - ['b', 'a', 'c', np.nan], name='categories') - expected = DataFrame([[0, 0], [1, 1 / 3.], [2, 2 / 3.], [0, 0]], - columns=['counts', 'freqs'], index=exp_idx) - tm.assert_frame_equal(result, expected, check_categorical=False) - def test_print(self): expected = ["[a, b, b, a, a, c, c, c]", "Categories (3, object): [a < b < c]"] @@ -1119,90 +1096,18 @@ def test_nan_handling(self): self.assert_numpy_array_equal(c._codes, np.array([0, -1, -1, 0], dtype=np.int8)) - # If categories have nan included, the code should point to that - # instead - with tm.assert_produces_warning(FutureWarning): - c = Categorical(["a", "b", np.nan, "a"], - categories=["a", "b", np.nan]) - self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) - self.assert_numpy_array_equal(c._codes, - np.array([0, 1, 2, 0], dtype=np.int8)) - c[1] = np.nan - self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) - self.assert_numpy_array_equal(c._codes, - np.array([0, 2, 2, 0], dtype=np.int8)) - - # Changing categories should also make the replaced category np.nan - c = Categorical(["a", "b", "c", "a"]) - with tm.assert_produces_warning(FutureWarning): - c.categories = ["a", "b", np.nan] # noqa - - self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) - self.assert_numpy_array_equal(c._codes, - np.array([0, 1, 2, 0], dtype=np.int8)) - # Adding nan to categories should make assigned nan point to the # category! c = Categorical(["a", "b", np.nan, "a"]) self.assert_index_equal(c.categories, Index(["a", "b"])) self.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8)) - with tm.assert_produces_warning(FutureWarning): - c.set_categories(["a", "b", np.nan], rename=True, inplace=True) - - self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) - self.assert_numpy_array_equal(c._codes, - np.array([0, 1, -1, 0], dtype=np.int8)) - c[1] = np.nan - self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) - self.assert_numpy_array_equal(c._codes, - np.array([0, 2, -1, 0], dtype=np.int8)) - - # Remove null categories (GH 10156) - cases = [([1.0, 2.0, np.nan], [1.0, 2.0]), - (['a', 'b', None], ['a', 'b']), - ([pd.Timestamp('2012-05-01'), pd.NaT], - [pd.Timestamp('2012-05-01')])] - - null_values = [np.nan, None, pd.NaT] - - for with_null, without in cases: - with tm.assert_produces_warning(FutureWarning): - base = Categorical([], with_null) - expected = Categorical([], without) - - for nullval in null_values: - result = base.remove_categories(nullval) - self.assert_categorical_equal(result, expected) - - # Different null values are indistinguishable - for i, j in [(0, 1), (0, 2), (1, 2)]: - nulls = [null_values[i], null_values[j]] - - def f(): - with tm.assert_produces_warning(FutureWarning): - Categorical([], categories=nulls) - - self.assertRaises(ValueError, f) def test_isnull(self): exp = np.array([False, False, True]) c = Categorical(["a", "b", np.nan]) res = c.isnull() - self.assert_numpy_array_equal(res, exp) - with tm.assert_produces_warning(FutureWarning): - c = Categorical(["a", "b", np.nan], categories=["a", "b", np.nan]) - res = c.isnull() - self.assert_numpy_array_equal(res, exp) - - # test both nan in categories and as -1 - exp = np.array([True, False, True]) - c = Categorical(["a", "b", np.nan]) - with tm.assert_produces_warning(FutureWarning): - c.set_categories(["a", "b", np.nan], rename=True, inplace=True) - c[0] = np.nan - res = c.isnull() self.assert_numpy_array_equal(res, exp) def test_codes_immutable(self): @@ -1487,45 +1392,10 @@ def test_slicing_directly(self): def test_set_item_nan(self): cat = pd.Categorical([1, 2, 3]) - exp = pd.Categorical([1, np.nan, 3], categories=[1, 2, 3]) cat[1] = np.nan - tm.assert_categorical_equal(cat, exp) - # if nan in categories, the proper code should be set! - cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) - with tm.assert_produces_warning(FutureWarning): - cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True) - cat[1] = np.nan - exp = np.array([0, 3, 2, -1], dtype=np.int8) - self.assert_numpy_array_equal(cat.codes, exp) - - cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) - with tm.assert_produces_warning(FutureWarning): - cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True) - cat[1:3] = np.nan - exp = np.array([0, 3, 3, -1], dtype=np.int8) - self.assert_numpy_array_equal(cat.codes, exp) - - cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) - with tm.assert_produces_warning(FutureWarning): - cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True) - cat[1:3] = [np.nan, 1] - exp = np.array([0, 3, 0, -1], dtype=np.int8) - self.assert_numpy_array_equal(cat.codes, exp) - - cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) - with tm.assert_produces_warning(FutureWarning): - cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True) - cat[1:3] = [np.nan, np.nan] - exp = np.array([0, 3, 3, -1], dtype=np.int8) - self.assert_numpy_array_equal(cat.codes, exp) - - cat = pd.Categorical([1, 2, np.nan, 3], categories=[1, 2, 3]) - with tm.assert_produces_warning(FutureWarning): - cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True) - cat[pd.isnull(cat)] = np.nan - exp = np.array([0, 1, 3, 2], dtype=np.int8) - self.assert_numpy_array_equal(cat.codes, exp) + exp = pd.Categorical([1, np.nan, 3], categories=[1, 2, 3]) + tm.assert_categorical_equal(cat, exp) def test_shift(self): # GH 9416 @@ -2026,33 +1896,12 @@ def test_sideeffects_free(self): def test_nan_handling(self): - # Nans are represented as -1 in labels + # NaNs are represented as -1 in labels s = Series(Categorical(["a", "b", np.nan, "a"])) self.assert_index_equal(s.cat.categories, Index(["a", "b"])) self.assert_numpy_array_equal(s.values.codes, np.array([0, 1, -1, 0], dtype=np.int8)) - # If categories have nan included, the label should point to that - # instead - with tm.assert_produces_warning(FutureWarning): - s2 = Series(Categorical(["a", "b", np.nan, "a"], - categories=["a", "b", np.nan])) - - exp_cat = Index(["a", "b", np.nan]) - self.assert_index_equal(s2.cat.categories, exp_cat) - self.assert_numpy_array_equal(s2.values.codes, - np.array([0, 1, 2, 0], dtype=np.int8)) - - # Changing categories should also make the replaced category np.nan - s3 = Series(Categorical(["a", "b", "c", "a"])) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s3.cat.categories = ["a", "b", np.nan] - - exp_cat = Index(["a", "b", np.nan]) - self.assert_index_equal(s3.cat.categories, exp_cat) - self.assert_numpy_array_equal(s3.values.codes, - np.array([0, 1, 2, 0], dtype=np.int8)) - def test_cat_accessor(self): s = Series(Categorical(["a", "b", np.nan, "a"])) self.assert_index_equal(s.cat.categories, Index(["a", "b"]))