Skip to content

Commit

Permalink
DEPR: Drop support for NaN categories in Categorical
Browse files Browse the repository at this point in the history
Deprecated in 0.17.0.
xref #10748
xref #13648

Author: Jeff Reback <jeff@reback.net>
Author: gfyoung <gfyoung17@gmail.com>

Closes #15806 from gfyoung/categories-nan-drop and squashes the following commits:

318175b [Jeff Reback] TST: test pd.NaT with correct dtype
4dce349 [gfyoung] Drop support for NaN categories in Categorical
  • Loading branch information
jreback committed Mar 27, 2017
1 parent 056c0a6 commit 80280ec
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 203 deletions.
9 changes: 9 additions & 0 deletions doc/source/categorical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,15 @@ Categories must be unique or a `ValueError` is raised:
except ValueError as e:
print("ValueError: " + str(e))
Categories must also not contain ``NaN``, or a `ValueError` is raised:

.. ipython:: python
try:
s.cat.categories = [1,2,np.nan]
except ValueError as e:
print("ValueError: " + str(e))
Appending new categories
~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -816,6 +816,7 @@ Removal of prior version deprecations/changes
in favor of ``iloc`` and ``iat`` as explained :ref:`here <whatsnew_0170.deprecations>` (:issue:`10711`).
- The deprecated ``DataFrame.iterkv()`` has been removed in favor of ``DataFrame.iteritems()`` (:issue:`10711`)
- The ``Categorical`` constructor has dropped the ``name`` parameter (:issue:`10632`)
- ``Categorical`` has dropped support for ``NaN`` categories (:issue:`10748`)
- The ``take_last`` parameter has been dropped from ``duplicated()``, ``drop_duplicates()``, ``nlargest()``, and ``nsmallest()`` methods (:issue:`10236`, :issue:`10792`, :issue:`10920`)
- ``Series``, ``Index``, and ``DataFrame`` have dropped the ``sort`` and ``order`` methods (:issue:`10726`)
- Where clauses in ``pytables`` are only accepted as strings and expression types, and not as other data types (:issue:`12027`)
Expand Down
13 changes: 3 additions & 10 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,18 +545,11 @@ def _validate_categories(cls, categories, fastpath=False):

if not fastpath:

# check properties of the categories
# we don't allow NaNs in the categories themselves

# Categories cannot contain NaN.
if categories.hasnans:
# NaNs in cats deprecated in 0.17
# GH 10748
msg = ('\nSetting NaNs in `categories` is deprecated and '
'will be removed in a future version of pandas.')
warn(msg, FutureWarning, stacklevel=3)

# categories must be unique
raise ValueError('Categorical categories cannot be null')

# Categories must be unique.
if not categories.is_unique:
raise ValueError('Categorical categories must be unique')

Expand Down
14 changes: 0 additions & 14 deletions pandas/tests/indexes/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,11 +183,6 @@ def test_contains(self):
self.assertFalse(0 in ci)
self.assertFalse(1 in ci)

with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
ci = CategoricalIndex(
list('aabbca'), categories=list('cabdef') + [np.nan])
self.assertFalse(np.nan in ci)

ci = CategoricalIndex(
list('aabbca') + [np.nan], categories=list('cabdef'))
self.assertTrue(np.nan in ci)
Expand Down Expand Up @@ -541,7 +536,6 @@ def test_ensure_copied_data(self):
self.assertIs(_base(index.values), _base(result.values))

def test_equals_categorical(self):

ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True)
ci2 = CategoricalIndex(['a', 'b'], categories=['a', 'b', 'c'],
ordered=True)
Expand Down Expand Up @@ -579,14 +573,6 @@ def test_equals_categorical(self):
self.assertFalse(ci.equals(CategoricalIndex(list('aabca'))))
self.assertTrue(ci.equals(ci.copy()))

with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
ci = CategoricalIndex(list('aabca'),
categories=['c', 'a', 'b', np.nan])
self.assertFalse(ci.equals(list('aabca')))
self.assertFalse(ci.equals(CategoricalIndex(list('aabca'))))
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
self.assertTrue(ci.equals(ci.copy()))

ci = CategoricalIndex(list('aabca') + [np.nan],
categories=['c', 'a', 'b'])
self.assertFalse(ci.equals(list('aabca')))
Expand Down
207 changes: 28 additions & 179 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
# pylint: disable=E1101,E1103,W0232

import pytest
import sys
from datetime import datetime
from distutils.version import LooseVersion
Expand All @@ -17,7 +18,8 @@
import pandas.compat as compat
import pandas.util.testing as tm
from pandas import (Categorical, Index, Series, DataFrame, PeriodIndex,
Timestamp, CategoricalIndex, isnull)
Timestamp, CategoricalIndex, DatetimeIndex,
isnull, NaT)
from pandas.compat import range, lrange, u, PY3
from pandas.core.config import option_context

Expand Down Expand Up @@ -160,12 +162,6 @@ def f():

self.assertRaises(ValueError, f)

def f():
with tm.assert_produces_warning(FutureWarning):
Categorical([1, 2], [1, 2, np.nan, np.nan])

self.assertRaises(ValueError, f)

# The default should be unordered
c1 = Categorical(["a", "b", "c", "a"])
self.assertFalse(c1.ordered)
Expand Down Expand Up @@ -222,29 +218,12 @@ def f():
cat = pd.Categorical([np.nan, 1., 2., 3.])
self.assertTrue(is_float_dtype(cat.categories))

# Deprecating NaNs in categories (GH #10748)
# preserve int as far as possible by converting to object if NaN is in
# categories
with tm.assert_produces_warning(FutureWarning):
cat = pd.Categorical([np.nan, 1, 2, 3],
categories=[np.nan, 1, 2, 3])
self.assertTrue(is_object_dtype(cat.categories))

# This doesn't work -> this would probably need some kind of "remember
# the original type" feature to try to cast the array interface result
# to...

# vals = np.asarray(cat[cat.notnull()])
# self.assertTrue(is_integer_dtype(vals))
with tm.assert_produces_warning(FutureWarning):
cat = pd.Categorical([np.nan, "a", "b", "c"],
categories=[np.nan, "a", "b", "c"])
self.assertTrue(is_object_dtype(cat.categories))
# but don't do it for floats
with tm.assert_produces_warning(FutureWarning):
cat = pd.Categorical([np.nan, 1., 2., 3.],
categories=[np.nan, 1., 2., 3.])
self.assertTrue(is_float_dtype(cat.categories))

# corner cases
cat = pd.Categorical([1])
Expand Down Expand Up @@ -295,6 +274,22 @@ def f():
c = Categorical(np.array([], dtype='int64'), # noqa
categories=[3, 2, 1], ordered=True)

def test_constructor_with_null(self):

# Cannot have NaN in categories
with pytest.raises(ValueError):
pd.Categorical([np.nan, "a", "b", "c"],
categories=[np.nan, "a", "b", "c"])

with pytest.raises(ValueError):
pd.Categorical([None, "a", "b", "c"],
categories=[None, "a", "b", "c"])

with pytest.raises(ValueError):
pd.Categorical(DatetimeIndex(['nat', '20160101']),
categories=[NaT, Timestamp('20160101')])


def test_constructor_with_index(self):
ci = CategoricalIndex(list('aabbca'), categories=list('cab'))
tm.assert_categorical_equal(ci.values, Categorical(ci))
Expand Down Expand Up @@ -418,6 +413,12 @@ def f():

self.assertRaises(ValueError, f)

# NaN categories included
def f():
Categorical.from_codes([0, 1, 2], ["a", "b", np.nan])

self.assertRaises(ValueError, f)

# too negative
def f():
Categorical.from_codes([-2, 1, 2], ["a", "b", "c"])
Expand Down Expand Up @@ -649,30 +650,6 @@ def test_describe(self):
name='categories'))
tm.assert_frame_equal(desc, expected)

# NA as a category
with tm.assert_produces_warning(FutureWarning):
cat = pd.Categorical(["a", "c", "c", np.nan],
categories=["b", "a", "c", np.nan])
result = cat.describe()

expected = DataFrame([[0, 0], [1, 0.25], [2, 0.5], [1, 0.25]],
columns=['counts', 'freqs'],
index=pd.CategoricalIndex(['b', 'a', 'c', np.nan],
name='categories'))
tm.assert_frame_equal(result, expected, check_categorical=False)

# NA as an unused category
with tm.assert_produces_warning(FutureWarning):
cat = pd.Categorical(["a", "c", "c"],
categories=["b", "a", "c", np.nan])
result = cat.describe()

exp_idx = pd.CategoricalIndex(
['b', 'a', 'c', np.nan], name='categories')
expected = DataFrame([[0, 0], [1, 1 / 3.], [2, 2 / 3.], [0, 0]],
columns=['counts', 'freqs'], index=exp_idx)
tm.assert_frame_equal(result, expected, check_categorical=False)

def test_print(self):
expected = ["[a, b, b, a, a, c, c, c]",
"Categories (3, object): [a < b < c]"]
Expand Down Expand Up @@ -1119,90 +1096,18 @@ def test_nan_handling(self):
self.assert_numpy_array_equal(c._codes,
np.array([0, -1, -1, 0], dtype=np.int8))

# If categories have nan included, the code should point to that
# instead
with tm.assert_produces_warning(FutureWarning):
c = Categorical(["a", "b", np.nan, "a"],
categories=["a", "b", np.nan])
self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
self.assert_numpy_array_equal(c._codes,
np.array([0, 1, 2, 0], dtype=np.int8))
c[1] = np.nan
self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
self.assert_numpy_array_equal(c._codes,
np.array([0, 2, 2, 0], dtype=np.int8))

# Changing categories should also make the replaced category np.nan
c = Categorical(["a", "b", "c", "a"])
with tm.assert_produces_warning(FutureWarning):
c.categories = ["a", "b", np.nan] # noqa

self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
self.assert_numpy_array_equal(c._codes,
np.array([0, 1, 2, 0], dtype=np.int8))

# Adding nan to categories should make assigned nan point to the
# category!
c = Categorical(["a", "b", np.nan, "a"])
self.assert_index_equal(c.categories, Index(["a", "b"]))
self.assert_numpy_array_equal(c._codes,
np.array([0, 1, -1, 0], dtype=np.int8))
with tm.assert_produces_warning(FutureWarning):
c.set_categories(["a", "b", np.nan], rename=True, inplace=True)

self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
self.assert_numpy_array_equal(c._codes,
np.array([0, 1, -1, 0], dtype=np.int8))
c[1] = np.nan
self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
self.assert_numpy_array_equal(c._codes,
np.array([0, 2, -1, 0], dtype=np.int8))

# Remove null categories (GH 10156)
cases = [([1.0, 2.0, np.nan], [1.0, 2.0]),
(['a', 'b', None], ['a', 'b']),
([pd.Timestamp('2012-05-01'), pd.NaT],
[pd.Timestamp('2012-05-01')])]

null_values = [np.nan, None, pd.NaT]

for with_null, without in cases:
with tm.assert_produces_warning(FutureWarning):
base = Categorical([], with_null)
expected = Categorical([], without)

for nullval in null_values:
result = base.remove_categories(nullval)
self.assert_categorical_equal(result, expected)

# Different null values are indistinguishable
for i, j in [(0, 1), (0, 2), (1, 2)]:
nulls = [null_values[i], null_values[j]]

def f():
with tm.assert_produces_warning(FutureWarning):
Categorical([], categories=nulls)

self.assertRaises(ValueError, f)

def test_isnull(self):
exp = np.array([False, False, True])
c = Categorical(["a", "b", np.nan])
res = c.isnull()
self.assert_numpy_array_equal(res, exp)

with tm.assert_produces_warning(FutureWarning):
c = Categorical(["a", "b", np.nan], categories=["a", "b", np.nan])
res = c.isnull()
self.assert_numpy_array_equal(res, exp)

# test both nan in categories and as -1
exp = np.array([True, False, True])
c = Categorical(["a", "b", np.nan])
with tm.assert_produces_warning(FutureWarning):
c.set_categories(["a", "b", np.nan], rename=True, inplace=True)
c[0] = np.nan
res = c.isnull()
self.assert_numpy_array_equal(res, exp)

def test_codes_immutable(self):
Expand Down Expand Up @@ -1487,45 +1392,10 @@ def test_slicing_directly(self):

def test_set_item_nan(self):
cat = pd.Categorical([1, 2, 3])
exp = pd.Categorical([1, np.nan, 3], categories=[1, 2, 3])
cat[1] = np.nan
tm.assert_categorical_equal(cat, exp)

# if nan in categories, the proper code should be set!
cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
with tm.assert_produces_warning(FutureWarning):
cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
cat[1] = np.nan
exp = np.array([0, 3, 2, -1], dtype=np.int8)
self.assert_numpy_array_equal(cat.codes, exp)

cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
with tm.assert_produces_warning(FutureWarning):
cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
cat[1:3] = np.nan
exp = np.array([0, 3, 3, -1], dtype=np.int8)
self.assert_numpy_array_equal(cat.codes, exp)

cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
with tm.assert_produces_warning(FutureWarning):
cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
cat[1:3] = [np.nan, 1]
exp = np.array([0, 3, 0, -1], dtype=np.int8)
self.assert_numpy_array_equal(cat.codes, exp)

cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
with tm.assert_produces_warning(FutureWarning):
cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
cat[1:3] = [np.nan, np.nan]
exp = np.array([0, 3, 3, -1], dtype=np.int8)
self.assert_numpy_array_equal(cat.codes, exp)

cat = pd.Categorical([1, 2, np.nan, 3], categories=[1, 2, 3])
with tm.assert_produces_warning(FutureWarning):
cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
cat[pd.isnull(cat)] = np.nan
exp = np.array([0, 1, 3, 2], dtype=np.int8)
self.assert_numpy_array_equal(cat.codes, exp)
exp = pd.Categorical([1, np.nan, 3], categories=[1, 2, 3])
tm.assert_categorical_equal(cat, exp)

def test_shift(self):
# GH 9416
Expand Down Expand Up @@ -2026,33 +1896,12 @@ def test_sideeffects_free(self):

def test_nan_handling(self):

# Nans are represented as -1 in labels
# NaNs are represented as -1 in labels
s = Series(Categorical(["a", "b", np.nan, "a"]))
self.assert_index_equal(s.cat.categories, Index(["a", "b"]))
self.assert_numpy_array_equal(s.values.codes,
np.array([0, 1, -1, 0], dtype=np.int8))

# If categories have nan included, the label should point to that
# instead
with tm.assert_produces_warning(FutureWarning):
s2 = Series(Categorical(["a", "b", np.nan, "a"],
categories=["a", "b", np.nan]))

exp_cat = Index(["a", "b", np.nan])
self.assert_index_equal(s2.cat.categories, exp_cat)
self.assert_numpy_array_equal(s2.values.codes,
np.array([0, 1, 2, 0], dtype=np.int8))

# Changing categories should also make the replaced category np.nan
s3 = Series(Categorical(["a", "b", "c", "a"]))
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
s3.cat.categories = ["a", "b", np.nan]

exp_cat = Index(["a", "b", np.nan])
self.assert_index_equal(s3.cat.categories, exp_cat)
self.assert_numpy_array_equal(s3.values.codes,
np.array([0, 1, 2, 0], dtype=np.int8))

def test_cat_accessor(self):
s = Series(Categorical(["a", "b", np.nan, "a"]))
self.assert_index_equal(s.cat.categories, Index(["a", "b"]))
Expand Down

0 comments on commit 80280ec

Please sign in to comment.