Skip to content

Commit

Permalink
Drop support for NaN categories in Categorical
Browse files Browse the repository at this point in the history
Deprecated in 0.17.0.

xref gh-10748
  • Loading branch information
gfyoung committed Mar 26, 2017
1 parent c577c19 commit 84fc041
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 202 deletions.
9 changes: 9 additions & 0 deletions doc/source/categorical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,15 @@ Categories must be unique or a `ValueError` is raised:
except ValueError as e:
print("ValueError: " + str(e))
Categories must also not be ``NaN`` or a `ValueError` is raised:

.. ipython:: python
try:
s.cat.categories = [1,2,np.nan]
except ValueError as e:
print("ValueError: " + str(e))
Appending new categories
~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -811,6 +811,7 @@ Removal of prior version deprecations/changes
in favor of ``iloc`` and ``iat`` as explained :ref:`here <whatsnew_0170.deprecations>` (:issue:`10711`).
- The deprecated ``DataFrame.iterkv()`` has been removed in favor of ``DataFrame.iteritems()`` (:issue:`10711`)
- The ``Categorical`` constructor has dropped the ``name`` parameter (:issue:`10632`)
- ``Categorical`` has dropped support for ``NaN`` categories (:issue:`10748`)
- The ``take_last`` parameter has been dropped from ``duplicated()``, ``drop_duplicates()``, ``nlargest()``, and ``nsmallest()`` methods (:issue:`10236`, :issue:`10792`, :issue:`10920`)
- ``Series``, ``Index``, and ``DataFrame`` have dropped the ``sort`` and ``order`` methods (:issue:`10726`)
- Where clauses in ``pytables`` are only accepted as strings and expression types, and not as other data types (:issue:`12027`)
Expand Down
13 changes: 3 additions & 10 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,18 +545,11 @@ def _validate_categories(cls, categories, fastpath=False):

if not fastpath:

# check properties of the categories
# we don't allow NaNs in the categories themselves

# Categories cannot contain NaN.
if categories.hasnans:
# NaNs in cats deprecated in 0.17
# GH 10748
msg = ('\nSetting NaNs in `categories` is deprecated and '
'will be removed in a future version of pandas.')
warn(msg, FutureWarning, stacklevel=3)

# categories must be unique
raise ValueError('Categorial categories cannot be NaN')

# Categories must be unique.
if not categories.is_unique:
raise ValueError('Categorical categories must be unique')

Expand Down
14 changes: 0 additions & 14 deletions pandas/tests/indexes/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,11 +183,6 @@ def test_contains(self):
self.assertFalse(0 in ci)
self.assertFalse(1 in ci)

with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
ci = CategoricalIndex(
list('aabbca'), categories=list('cabdef') + [np.nan])
self.assertFalse(np.nan in ci)

ci = CategoricalIndex(
list('aabbca') + [np.nan], categories=list('cabdef'))
self.assertTrue(np.nan in ci)
Expand Down Expand Up @@ -541,7 +536,6 @@ def test_ensure_copied_data(self):
self.assertIs(_base(index.values), _base(result.values))

def test_equals_categorical(self):

ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True)
ci2 = CategoricalIndex(['a', 'b'], categories=['a', 'b', 'c'],
ordered=True)
Expand Down Expand Up @@ -579,14 +573,6 @@ def test_equals_categorical(self):
self.assertFalse(ci.equals(CategoricalIndex(list('aabca'))))
self.assertTrue(ci.equals(ci.copy()))

with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
ci = CategoricalIndex(list('aabca'),
categories=['c', 'a', 'b', np.nan])
self.assertFalse(ci.equals(list('aabca')))
self.assertFalse(ci.equals(CategoricalIndex(list('aabca'))))
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
self.assertTrue(ci.equals(ci.copy()))

ci = CategoricalIndex(list('aabca') + [np.nan],
categories=['c', 'a', 'b'])
self.assertFalse(ci.equals(list('aabca')))
Expand Down
194 changes: 16 additions & 178 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,12 +160,6 @@ def f():

self.assertRaises(ValueError, f)

def f():
with tm.assert_produces_warning(FutureWarning):
Categorical([1, 2], [1, 2, np.nan, np.nan])

self.assertRaises(ValueError, f)

# The default should be unordered
c1 = Categorical(["a", "b", "c", "a"])
self.assertFalse(c1.ordered)
Expand Down Expand Up @@ -222,29 +216,19 @@ def f():
cat = pd.Categorical([np.nan, 1., 2., 3.])
self.assertTrue(is_float_dtype(cat.categories))

# Deprecating NaNs in categories (GH #10748)
# preserve int as far as possible by converting to object if NaN is in
# categories
with tm.assert_produces_warning(FutureWarning):
cat = pd.Categorical([np.nan, 1, 2, 3],
categories=[np.nan, 1, 2, 3])
self.assertTrue(is_object_dtype(cat.categories))

# This doesn't work -> this would probably need some kind of "remember
# the original type" feature to try to cast the array interface result
# to...

# vals = np.asarray(cat[cat.notnull()])
# self.assertTrue(is_integer_dtype(vals))
with tm.assert_produces_warning(FutureWarning):
cat = pd.Categorical([np.nan, "a", "b", "c"],
categories=[np.nan, "a", "b", "c"])
self.assertTrue(is_object_dtype(cat.categories))
# but don't do it for floats
with tm.assert_produces_warning(FutureWarning):
cat = pd.Categorical([np.nan, 1., 2., 3.],
categories=[np.nan, 1., 2., 3.])
self.assertTrue(is_float_dtype(cat.categories))

# Cannot have NaN in categories
def f():
pd.Categorical([np.nan, "a", "b", "c"],
categories=[np.nan, "a", "b", "c"])

self.assertRaises(ValueError, f)

# corner cases
cat = pd.Categorical([1])
Expand Down Expand Up @@ -418,6 +402,12 @@ def f():

self.assertRaises(ValueError, f)

# NaN categories included
def f():
Categorical.from_codes([0, 1, 2], ["a", "b", np.nan])

self.assertRaises(ValueError, f)

# too negative
def f():
Categorical.from_codes([-2, 1, 2], ["a", "b", "c"])
Expand Down Expand Up @@ -649,30 +639,6 @@ def test_describe(self):
name='categories'))
tm.assert_frame_equal(desc, expected)

# NA as a category
with tm.assert_produces_warning(FutureWarning):
cat = pd.Categorical(["a", "c", "c", np.nan],
categories=["b", "a", "c", np.nan])
result = cat.describe()

expected = DataFrame([[0, 0], [1, 0.25], [2, 0.5], [1, 0.25]],
columns=['counts', 'freqs'],
index=pd.CategoricalIndex(['b', 'a', 'c', np.nan],
name='categories'))
tm.assert_frame_equal(result, expected, check_categorical=False)

# NA as an unused category
with tm.assert_produces_warning(FutureWarning):
cat = pd.Categorical(["a", "c", "c"],
categories=["b", "a", "c", np.nan])
result = cat.describe()

exp_idx = pd.CategoricalIndex(
['b', 'a', 'c', np.nan], name='categories')
expected = DataFrame([[0, 0], [1, 1 / 3.], [2, 2 / 3.], [0, 0]],
columns=['counts', 'freqs'], index=exp_idx)
tm.assert_frame_equal(result, expected, check_categorical=False)

def test_print(self):
expected = ["[a, b, b, a, a, c, c, c]",
"Categories (3, object): [a < b < c]"]
Expand Down Expand Up @@ -1119,90 +1085,18 @@ def test_nan_handling(self):
self.assert_numpy_array_equal(c._codes,
np.array([0, -1, -1, 0], dtype=np.int8))

# If categories have nan included, the code should point to that
# instead
with tm.assert_produces_warning(FutureWarning):
c = Categorical(["a", "b", np.nan, "a"],
categories=["a", "b", np.nan])
self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
self.assert_numpy_array_equal(c._codes,
np.array([0, 1, 2, 0], dtype=np.int8))
c[1] = np.nan
self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
self.assert_numpy_array_equal(c._codes,
np.array([0, 2, 2, 0], dtype=np.int8))

# Changing categories should also make the replaced category np.nan
c = Categorical(["a", "b", "c", "a"])
with tm.assert_produces_warning(FutureWarning):
c.categories = ["a", "b", np.nan] # noqa

self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
self.assert_numpy_array_equal(c._codes,
np.array([0, 1, 2, 0], dtype=np.int8))

# Adding nan to categories should make assigned nan point to the
# category!
c = Categorical(["a", "b", np.nan, "a"])
self.assert_index_equal(c.categories, Index(["a", "b"]))
self.assert_numpy_array_equal(c._codes,
np.array([0, 1, -1, 0], dtype=np.int8))
with tm.assert_produces_warning(FutureWarning):
c.set_categories(["a", "b", np.nan], rename=True, inplace=True)

self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
self.assert_numpy_array_equal(c._codes,
np.array([0, 1, -1, 0], dtype=np.int8))
c[1] = np.nan
self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
self.assert_numpy_array_equal(c._codes,
np.array([0, 2, -1, 0], dtype=np.int8))

# Remove null categories (GH 10156)
cases = [([1.0, 2.0, np.nan], [1.0, 2.0]),
(['a', 'b', None], ['a', 'b']),
([pd.Timestamp('2012-05-01'), pd.NaT],
[pd.Timestamp('2012-05-01')])]

null_values = [np.nan, None, pd.NaT]

for with_null, without in cases:
with tm.assert_produces_warning(FutureWarning):
base = Categorical([], with_null)
expected = Categorical([], without)

for nullval in null_values:
result = base.remove_categories(nullval)
self.assert_categorical_equal(result, expected)

# Different null values are indistinguishable
for i, j in [(0, 1), (0, 2), (1, 2)]:
nulls = [null_values[i], null_values[j]]

def f():
with tm.assert_produces_warning(FutureWarning):
Categorical([], categories=nulls)

self.assertRaises(ValueError, f)

def test_isnull(self):
exp = np.array([False, False, True])
c = Categorical(["a", "b", np.nan])
res = c.isnull()
self.assert_numpy_array_equal(res, exp)

with tm.assert_produces_warning(FutureWarning):
c = Categorical(["a", "b", np.nan], categories=["a", "b", np.nan])
res = c.isnull()
self.assert_numpy_array_equal(res, exp)

# test both nan in categories and as -1
exp = np.array([True, False, True])
c = Categorical(["a", "b", np.nan])
with tm.assert_produces_warning(FutureWarning):
c.set_categories(["a", "b", np.nan], rename=True, inplace=True)
c[0] = np.nan
res = c.isnull()
self.assert_numpy_array_equal(res, exp)

def test_codes_immutable(self):
Expand Down Expand Up @@ -1487,45 +1381,10 @@ def test_slicing_directly(self):

def test_set_item_nan(self):
cat = pd.Categorical([1, 2, 3])
exp = pd.Categorical([1, np.nan, 3], categories=[1, 2, 3])
cat[1] = np.nan
tm.assert_categorical_equal(cat, exp)

# if nan in categories, the proper code should be set!
cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
with tm.assert_produces_warning(FutureWarning):
cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
cat[1] = np.nan
exp = np.array([0, 3, 2, -1], dtype=np.int8)
self.assert_numpy_array_equal(cat.codes, exp)

cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
with tm.assert_produces_warning(FutureWarning):
cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
cat[1:3] = np.nan
exp = np.array([0, 3, 3, -1], dtype=np.int8)
self.assert_numpy_array_equal(cat.codes, exp)

cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
with tm.assert_produces_warning(FutureWarning):
cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
cat[1:3] = [np.nan, 1]
exp = np.array([0, 3, 0, -1], dtype=np.int8)
self.assert_numpy_array_equal(cat.codes, exp)

cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
with tm.assert_produces_warning(FutureWarning):
cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
cat[1:3] = [np.nan, np.nan]
exp = np.array([0, 3, 3, -1], dtype=np.int8)
self.assert_numpy_array_equal(cat.codes, exp)

cat = pd.Categorical([1, 2, np.nan, 3], categories=[1, 2, 3])
with tm.assert_produces_warning(FutureWarning):
cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
cat[pd.isnull(cat)] = np.nan
exp = np.array([0, 1, 3, 2], dtype=np.int8)
self.assert_numpy_array_equal(cat.codes, exp)
exp = pd.Categorical([1, np.nan, 3], categories=[1, 2, 3])
tm.assert_categorical_equal(cat, exp)

def test_shift(self):
# GH 9416
Expand Down Expand Up @@ -2026,33 +1885,12 @@ def test_sideeffects_free(self):

def test_nan_handling(self):

# Nans are represented as -1 in labels
# NaNs are represented as -1 in labels
s = Series(Categorical(["a", "b", np.nan, "a"]))
self.assert_index_equal(s.cat.categories, Index(["a", "b"]))
self.assert_numpy_array_equal(s.values.codes,
np.array([0, 1, -1, 0], dtype=np.int8))

# If categories have nan included, the label should point to that
# instead
with tm.assert_produces_warning(FutureWarning):
s2 = Series(Categorical(["a", "b", np.nan, "a"],
categories=["a", "b", np.nan]))

exp_cat = Index(["a", "b", np.nan])
self.assert_index_equal(s2.cat.categories, exp_cat)
self.assert_numpy_array_equal(s2.values.codes,
np.array([0, 1, 2, 0], dtype=np.int8))

# Changing categories should also make the replaced category np.nan
s3 = Series(Categorical(["a", "b", "c", "a"]))
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
s3.cat.categories = ["a", "b", np.nan]

exp_cat = Index(["a", "b", np.nan])
self.assert_index_equal(s3.cat.categories, exp_cat)
self.assert_numpy_array_equal(s3.values.codes,
np.array([0, 1, 2, 0], dtype=np.int8))

def test_cat_accessor(self):
s = Series(Categorical(["a", "b", np.nan, "a"]))
self.assert_index_equal(s.cat.categories, Index(["a", "b"]))
Expand Down

0 comments on commit 84fc041

Please sign in to comment.