Skip to content

Commit

Permalink
REF: clearer Categorical/CategoricalIndex construction (pandas-dev#24419
Browse files Browse the repository at this point in the history
)
  • Loading branch information
topper-123 authored and Pingviinituutti committed Feb 28, 2019
1 parent 64a3501 commit 747639d
Show file tree
Hide file tree
Showing 6 changed files with 148 additions and 76 deletions.
54 changes: 12 additions & 42 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,50 +316,19 @@ class Categorical(ExtensionArray, PandasObject):
def __init__(self, values, categories=None, ordered=None, dtype=None,
fastpath=False):

# Ways of specifying the dtype (prioritized ordered)
# 1. dtype is a CategoricalDtype
# a.) with known categories, use dtype.categories
# b.) else with Categorical values, use values.dtype
# c.) else, infer from values
# d.) specifying dtype=CategoricalDtype and categories is an error
# 2. dtype is a string 'category'
# a.) use categories, ordered
# b.) use values.dtype
# c.) infer from values
# 3. dtype is None
# a.) use categories, ordered
# b.) use values.dtype
# c.) infer from values
if dtype is not None:
# The dtype argument takes precedence over values.dtype (if any)
if isinstance(dtype, compat.string_types):
if dtype == 'category':
dtype = CategoricalDtype(categories, ordered)
else:
msg = "Unknown `dtype` {dtype}"
raise ValueError(msg.format(dtype=dtype))
elif categories is not None or ordered is not None:
raise ValueError("Cannot specify both `dtype` and `categories`"
" or `ordered`.")
elif is_categorical(values):
# If no "dtype" was passed, use the one from "values", but honor
# the "ordered" and "categories" arguments
dtype = values.dtype._from_categorical_dtype(values.dtype,
categories, ordered)
dtype = CategoricalDtype._from_values_or_dtype(values, categories,
ordered, dtype)
# At this point, dtype is always a CategoricalDtype, but
# we may have dtype.categories be None, and we need to
# infer categories in a factorization step futher below

if is_categorical(values):
# GH23814, for perf, if values._values already an instance of
# Categorical, set values to codes, and run fastpath
if (isinstance(values, (ABCSeries, ABCIndexClass)) and
isinstance(values._values, type(self))):
values = values._values.codes.copy()
fastpath = True
else:
# If dtype=None and values is not categorical, create a new dtype
dtype = CategoricalDtype(categories, ordered)

# At this point, dtype is always a CategoricalDtype and you should not
# use categories and ordered seperately.
# if dtype.categories is None, we are inferring

if fastpath:
self._codes = coerce_indexer_dtype(values, dtype.categories)
Expand Down Expand Up @@ -656,6 +625,9 @@ def from_codes(cls, codes, categories, ordered=False):
categorical. If not given, the resulting categorical will be
unordered.
"""
dtype = CategoricalDtype._from_values_or_dtype(codes, categories,
ordered)

codes = np.asarray(codes) # #21767
if not is_integer_dtype(codes):
msg = "codes need to be array-like integers"
Expand All @@ -675,14 +647,12 @@ def from_codes(cls, codes, categories, ordered=False):
raise ValueError(
"codes need to be convertible to an arrays of integers")

categories = CategoricalDtype.validate_categories(categories)

if len(codes) and (codes.max() >= len(categories) or codes.min() < -1):
if len(codes) and (
codes.max() >= len(dtype.categories) or codes.min() < -1):
raise ValueError("codes need to be between -1 and "
"len(categories)-1")

return cls(codes, categories=categories, ordered=ordered,
fastpath=True)
return cls(codes, dtype=dtype, fastpath=True)

_codes = None

Expand Down
90 changes: 89 additions & 1 deletion pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from pandas import compat

from .base import ExtensionDtype, _DtypeOpsMixin
from .inference import is_list_like


def register_extension_dtype(cls):
Expand Down Expand Up @@ -240,6 +241,90 @@ def _from_categorical_dtype(cls, dtype, categories=None, ordered=None):
ordered = dtype.ordered
return cls(categories, ordered)

@classmethod
def _from_values_or_dtype(cls, values=None, categories=None, ordered=None,
dtype=None):
"""
Construct dtype from the input parameters used in :class:`Categorical`.
This constructor method specifically does not do the factorization
step, if that is needed to find the categories. This constructor may
therefore return ``CategoricalDtype(categories=None, ordered=None)``,
which may not be useful. Additional steps may therefore have to be
taken to create the final dtype.
The return dtype is specified from the inputs in this prioritized
order:
1. if dtype is a CategoricalDtype, return dtype
2. if dtype is the string 'category', create a CategoricalDtype from
the supplied categories and ordered parameters, and return that.
3. if values is a categorical, use value.dtype, but override it with
categories and ordered if either/both of those are not None.
4. if dtype is None and values is not a categorical, construct the
dtype from categories and ordered, even if either of those is None.
Parameters
----------
values : list-like, optional
The list-like must be 1-dimensional.
categories : list-like, optional
Categories for the CategoricalDtype.
ordered : bool, optional
Designating if the categories are ordered.
dtype : CategoricalDtype or the string "category", optional
If ``CategoricalDtype``, cannot be used together with
`categories` or `ordered`.
Returns
-------
CategoricalDtype
Examples
--------
>>> CategoricalDtype._from_values_or_dtype()
CategoricalDtype(categories=None, ordered=None)
>>> CategoricalDtype._from_values_or_dtype(categories=['a', 'b'],
... ordered=True)
CategoricalDtype(categories=['a', 'b'], ordered=True)
>>> dtype1 = CategoricalDtype(['a', 'b'], ordered=True)
>>> dtype2 = CategoricalDtype(['x', 'y'], ordered=False)
>>> c = Categorical([0, 1], dtype=dtype1, fastpath=True)
>>> CategoricalDtype._from_values_or_dtype(c, ['x', 'y'], ordered=True,
... dtype=dtype2)
ValueError: Cannot specify `categories` or `ordered` together with
`dtype`.
The supplied dtype takes precedence over values' dtype:
>>> CategoricalDtype._from_values_or_dtype(c, dtype=dtype2)
CategoricalDtype(['x', 'y'], ordered=False)
"""
from pandas.core.dtypes.common import is_categorical

if dtype is not None:
# The dtype argument takes precedence over values.dtype (if any)
if isinstance(dtype, compat.string_types):
if dtype == 'category':
dtype = CategoricalDtype(categories, ordered)
else:
msg = "Unknown dtype {dtype!r}"
raise ValueError(msg.format(dtype=dtype))
elif categories is not None or ordered is not None:
raise ValueError("Cannot specify `categories` or `ordered` "
"together with `dtype`.")
elif is_categorical(values):
# If no "dtype" was passed, use the one from "values", but honor
# the "ordered" and "categories" arguments
dtype = values.dtype._from_categorical_dtype(values.dtype,
categories, ordered)
else:
# If dtype=None and values is not categorical, create a new dtype.
# Note: This could potentially have categories=None and
# ordered=None.
dtype = CategoricalDtype(categories, ordered)

return dtype

def _finalize(self, categories, ordered, fastpath=False):

if ordered is not None:
Expand Down Expand Up @@ -408,7 +493,10 @@ def validate_categories(categories, fastpath=False):
"""
from pandas import Index

if not isinstance(categories, ABCIndexClass):
if not fastpath and not is_list_like(categories):
msg = "Parameter 'categories' must be list-like, was {!r}"
raise TypeError(msg.format(categories))
elif not isinstance(categories, ABCIndexClass):
categories = Index(categories, tupleize_cols=False)

if not fastpath:
Expand Down
42 changes: 13 additions & 29 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,29 +107,23 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None,
if fastpath:
return cls._simple_new(data, name=name, dtype=dtype)

dtype = CategoricalDtype._from_values_or_dtype(data, categories,
ordered, dtype)

if name is None and hasattr(data, 'name'):
name = data.name

if isinstance(data, ABCCategorical):
data = cls._create_categorical(data, categories, ordered,
dtype)
elif isinstance(data, CategoricalIndex):
data = data._data
data = cls._create_categorical(data, categories, ordered,
dtype)
else:

if not is_categorical_dtype(data):
# don't allow scalars
# if data is None, then categories must be provided
if is_scalar(data):
if data is not None or categories is None:
cls._scalar_data_error(data)
data = []
data = cls._create_categorical(data, categories, ordered,
dtype)

if copy:
data = data.copy()
data = cls._create_categorical(data, dtype=dtype)

data = data.copy() if copy else data

return cls._simple_new(data, name=name)

Expand Down Expand Up @@ -159,8 +153,7 @@ def _create_from_codes(self, codes, dtype=None, name=None):
return CategoricalIndex(cat, name=name)

@classmethod
def _create_categorical(cls, data, categories=None, ordered=None,
dtype=None):
def _create_categorical(cls, data, dtype=None):
"""
*this is an internal non-public method*
Expand All @@ -169,8 +162,6 @@ def _create_categorical(cls, data, categories=None, ordered=None,
Parameters
----------
data : data for new Categorical
categories : optional categories, defaults to existing
ordered : optional ordered attribute, defaults to existing
dtype : CategoricalDtype, defaults to existing
Returns
Expand All @@ -182,18 +173,11 @@ def _create_categorical(cls, data, categories=None, ordered=None,
data = data.values

if not isinstance(data, ABCCategorical):
if ordered is None and dtype is None:
ordered = False
data = Categorical(data, categories=categories, ordered=ordered,
dtype=dtype)
else:
if categories is not None:
data = data.set_categories(categories, ordered=ordered)
elif ordered is not None and ordered != data.ordered:
data = data.set_ordered(ordered)
if isinstance(dtype, CategoricalDtype) and dtype != data.dtype:
# we want to silently ignore dtype='category'
data = data._set_dtype(dtype)
return Categorical(data, dtype=dtype)

if isinstance(dtype, CategoricalDtype) and dtype != data.dtype:
# we want to silently ignore dtype='category'
data = data._set_dtype(dtype)
return data

@classmethod
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/categorical/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,7 @@ def test_constructor_str_category(self, categories, ordered):
tm.assert_categorical_equal(result, expected)

def test_constructor_str_unknown(self):
with pytest.raises(ValueError, match="Unknown `dtype`"):
with pytest.raises(ValueError, match="Unknown dtype"):
Categorical([1, 2], dtype="foo")

def test_constructor_from_categorical_with_dtype(self):
Expand Down
34 changes: 32 additions & 2 deletions pandas/tests/dtypes/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,40 @@ def test_construction_from_string(self):
TypeError, lambda: CategoricalDtype.construct_from_string('foo'))

def test_constructor_invalid(self):
msg = "CategoricalIndex.* must be called"
msg = "Parameter 'categories' must be list-like"
with pytest.raises(TypeError, match=msg):
CategoricalDtype("category")

dtype1 = CategoricalDtype(['a', 'b'], ordered=True)
dtype2 = CategoricalDtype(['x', 'y'], ordered=False)
c = Categorical([0, 1], dtype=dtype1, fastpath=True)

@pytest.mark.parametrize('values, categories, ordered, dtype, expected',
[
[None, None, None, None,
CategoricalDtype()],
[None, ['a', 'b'], True, None, dtype1],
[c, None, None, dtype2, dtype2],
[c, ['x', 'y'], False, None, dtype2],
])
def test_from_values_or_dtype(
self, values, categories, ordered, dtype, expected):
result = CategoricalDtype._from_values_or_dtype(values, categories,
ordered, dtype)
assert result == expected

@pytest.mark.parametrize('values, categories, ordered, dtype', [
[None, ['a', 'b'], True, dtype2],
[None, ['a', 'b'], None, dtype2],
[None, None, True, dtype2],
])
def test_from_values_or_dtype_raises(self, values, categories,
ordered, dtype):
msg = "Cannot specify `categories` or `ordered` together with `dtype`."
with pytest.raises(ValueError, match=msg):
CategoricalDtype._from_values_or_dtype(values, categories,
ordered, dtype)

def test_is_dtype(self):
assert CategoricalDtype.is_dtype(self.dtype)
assert CategoricalDtype.is_dtype('category')
Expand Down Expand Up @@ -706,7 +736,7 @@ def test_invalid_raises(self):
with pytest.raises(TypeError, match='ordered'):
CategoricalDtype(['a', 'b'], ordered='foo')

with pytest.raises(TypeError, match='collection'):
with pytest.raises(TypeError, match="'categories' must be list-like"):
CategoricalDtype('category')

def test_mixed(self):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexes/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def test_construction_with_categorical_dtype(self):
tm.assert_index_equal(result, expected, exact=True)

# error when combining categories/ordered and dtype kwargs
msg = 'Cannot specify both `dtype` and `categories` or `ordered`.'
msg = 'Cannot specify `categories` or `ordered` together with `dtype`.'
with pytest.raises(ValueError, match=msg):
CategoricalIndex(data, categories=cats, dtype=dtype)

Expand Down

0 comments on commit 747639d

Please sign in to comment.