From 7818486859d1aba53ce359b93cfc772e688958e5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 19 Aug 2017 06:27:05 -0500 Subject: [PATCH] BUG: Have object dtype for empty Categorical.categories (#17249) * BUG: Have object dtype for empty Categorical ctor Previously we had a `Float64Index`, which is inconsistent with, e.g., the regular Index constructor. * TST: Update tests in multi for new return Previously these worked around the return type by wrapping list-likes in `np.array` and relying on that to cast to float. These workarounds are no longer necessary. * TST: Update union_categorical tests This relied on `NaN` being a float and empty being a float. Not a necessary test anymore. * TST: set object dtype --- doc/source/whatsnew/v0.21.0.txt | 3 +++ pandas/core/categorical.py | 5 ++++- pandas/tests/indexes/test_multi.py | 9 ++++----- pandas/tests/reshape/test_concat.py | 2 +- pandas/tests/reshape/test_union_categoricals.py | 12 +++--------- pandas/tests/test_categorical.py | 10 ++++++++++ 6 files changed, 25 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 4f55c6388c728..6008ea5d4cbcd 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -385,6 +385,9 @@ Numeric Categorical ^^^^^^^^^^^ - Bug in :func:`Series.isin` when called with a categorical (:issue`16639`) +- Bug in the categorical constructor with empty values and categories causing + the ``.categories`` to be an empty ``Float64Index`` rather than an empty + ``Index`` with object dtype (:issue:`17248`) Other diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 230361931125e..1c2a29333001c 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -290,7 +290,10 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): # On list with NaNs, int values will be converted to float. Use # "object" dtype to prevent this. 
In the end objects will be # casted to int/... in the category assignment step. - dtype = 'object' if isna(values).any() else None + if len(values) == 0 or isna(values).any(): + dtype = 'object' + else: + dtype = None values = _sanitize_array(values, None, dtype=dtype) if categories is None: diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index da1b309f5a621..c66775f4690cc 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -776,7 +776,7 @@ def test_from_arrays_empty(self): arrays = [[]] * N names = list('ABC')[:N] result = MultiIndex.from_arrays(arrays=arrays, names=names) - expected = MultiIndex(levels=[np.array([])] * N, labels=[[]] * N, + expected = MultiIndex(levels=[[]] * N, labels=[[]] * N, names=names) tm.assert_index_equal(result, expected) @@ -829,7 +829,7 @@ def test_from_product_empty(self): # 1 level result = MultiIndex.from_product([[]], names=['A']) - expected = pd.Float64Index([], name='A') + expected = pd.Index([], name='A') tm.assert_index_equal(result, expected) # 2 levels @@ -838,7 +838,7 @@ def test_from_product_empty(self): names = ['A', 'B'] for first, second in zip(l1, l2): result = MultiIndex.from_product([first, second], names=names) - expected = MultiIndex(levels=[np.array(first), np.array(second)], + expected = MultiIndex(levels=[first, second], labels=[[], []], names=names) tm.assert_index_equal(result, expected) @@ -847,8 +847,7 @@ def test_from_product_empty(self): for N in range(4): lvl2 = lrange(N) result = MultiIndex.from_product([[], lvl2, []], names=names) - expected = MultiIndex(levels=[np.array(A) - for A in [[], lvl2, []]], + expected = MultiIndex(levels=[[], lvl2, []], labels=[[], [], []], names=names) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 46fea86c45925..52cd18126859a 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py 
@@ -680,7 +680,7 @@ def test_concat_categorical_empty(self): tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) s1 = pd.Series([], dtype='category') - s2 = pd.Series([]) + s2 = pd.Series([], dtype='object') # different dtype => not-category tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py index fe8d54005ba9b..eb80fb54b4016 100644 --- a/pandas/tests/reshape/test_union_categoricals.py +++ b/pandas/tests/reshape/test_union_categoricals.py @@ -107,17 +107,11 @@ def test_union_categoricals_empty(self): exp = Categorical([]) tm.assert_categorical_equal(res, exp) - res = union_categoricals([pd.Categorical([]), - pd.Categorical([1.0])]) - exp = Categorical([1.0]) + res = union_categoricals([Categorical([]), + Categorical(['1'])]) + exp = Categorical(['1']) tm.assert_categorical_equal(res, exp) - # to make dtype equal - nanc = pd.Categorical(np.array([np.nan], dtype=np.float64)) - res = union_categoricals([nanc, - pd.Categorical([])]) - tm.assert_categorical_equal(res, nanc) - def test_union_categorical_same_category(self): # check fastpath c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4]) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index a0b585a16ad9a..7bbe220378993 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -112,6 +112,16 @@ def test_setitem_listlike(self): result = c.codes[np.array([100000]).astype(np.int64)] tm.assert_numpy_array_equal(result, np.array([5], dtype='int8')) + def test_constructor_empty(self): + # GH 17248 + c = Categorical([]) + expected = Index([]) + tm.assert_index_equal(c.categories, expected) + + c = Categorical([], categories=[1, 2, 3]) + expected = pd.Int64Index([1, 2, 3]) + tm.assert_index_equal(c.categories, expected) + def test_constructor_unsortable(self): # it works!