REF: clearer Categorical/CategoricalIndex construction (pandas-dev#24419

)
Pingviinituutti · Feb 28, 2019 · 747639d · 747639d
1 parent 64a3501
commit 747639d
Show file tree

Hide file tree

Showing 6 changed files with 148 additions and 76 deletions.
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -316,50 +316,19 @@ class Categorical(ExtensionArray, PandasObject):
     def __init__(self, values, categories=None, ordered=None, dtype=None,
                  fastpath=False):
 
-        # Ways of specifying the dtype (prioritized ordered)
-        # 1. dtype is a CategoricalDtype
-        #    a.) with known categories, use dtype.categories
-        #    b.) else with Categorical values, use values.dtype
-        #    c.) else, infer from values
-        #    d.) specifying dtype=CategoricalDtype and categories is an error
-        # 2. dtype is a string 'category'
-        #    a.) use categories, ordered
-        #    b.) use values.dtype
-        #    c.) infer from values
-        # 3. dtype is None
-        #    a.) use categories, ordered
-        #    b.) use values.dtype
-        #    c.) infer from values
-        if dtype is not None:
-            # The dtype argument takes precedence over values.dtype (if any)
-            if isinstance(dtype, compat.string_types):
-                if dtype == 'category':
-                    dtype = CategoricalDtype(categories, ordered)
-                else:
-                    msg = "Unknown `dtype` {dtype}"
-                    raise ValueError(msg.format(dtype=dtype))
-            elif categories is not None or ordered is not None:
-                raise ValueError("Cannot specify both `dtype` and `categories`"
-                                 " or `ordered`.")
-        elif is_categorical(values):
-            # If no "dtype" was passed, use the one from "values", but honor
-            # the "ordered" and "categories" arguments
-            dtype = values.dtype._from_categorical_dtype(values.dtype,
-                                                         categories, ordered)
+        dtype = CategoricalDtype._from_values_or_dtype(values, categories,
+                                                       ordered, dtype)
+        # At this point, dtype is always a CategoricalDtype, but
+        # we may have dtype.categories be None, and we need to
+        # infer categories in a factorization step futher below
 
+        if is_categorical(values):
             # GH23814, for perf, if values._values already an instance of
             # Categorical, set values to codes, and run fastpath
             if (isinstance(values, (ABCSeries, ABCIndexClass)) and
                isinstance(values._values, type(self))):
                 values = values._values.codes.copy()
                 fastpath = True
-        else:
-            # If dtype=None and values is not categorical, create a new dtype
-            dtype = CategoricalDtype(categories, ordered)
-
-        # At this point, dtype is always a CategoricalDtype and you should not
-        # use categories and ordered seperately.
-        # if dtype.categories is None, we are inferring
 
         if fastpath:
             self._codes = coerce_indexer_dtype(values, dtype.categories)
@@ -656,6 +625,9 @@ def from_codes(cls, codes, categories, ordered=False):
             categorical. If not given, the resulting categorical will be
             unordered.
         """
+        dtype = CategoricalDtype._from_values_or_dtype(codes, categories,
+                                                       ordered)
+
         codes = np.asarray(codes)  # #21767
         if not is_integer_dtype(codes):
             msg = "codes need to be array-like integers"
@@ -675,14 +647,12 @@ def from_codes(cls, codes, categories, ordered=False):
             raise ValueError(
                 "codes need to be convertible to an arrays of integers")
 
-        categories = CategoricalDtype.validate_categories(categories)
-
-        if len(codes) and (codes.max() >= len(categories) or codes.min() < -1):
+        if len(codes) and (
+                codes.max() >= len(dtype.categories) or codes.min() < -1):
             raise ValueError("codes need to be between -1 and "
                              "len(categories)-1")
 
-        return cls(codes, categories=categories, ordered=ordered,
-                   fastpath=True)
+        return cls(codes, dtype=dtype, fastpath=True)
 
     _codes = None
 

diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -13,6 +13,7 @@
 from pandas import compat
 
 from .base import ExtensionDtype, _DtypeOpsMixin
+from .inference import is_list_like
 
 
 def register_extension_dtype(cls):
@@ -240,6 +241,90 @@ def _from_categorical_dtype(cls, dtype, categories=None, ordered=None):
             ordered = dtype.ordered
         return cls(categories, ordered)
 
+    @classmethod
+    def _from_values_or_dtype(cls, values=None, categories=None, ordered=None,
+                              dtype=None):
+        """
+        Construct dtype from the input parameters used in :class:`Categorical`.
+
+        This constructor method specifically does not do the factorization
+        step, if that is needed to find the categories. This constructor may
+        therefore return ``CategoricalDtype(categories=None, ordered=None)``,
+        which may not be useful. Additional steps may therefore have to be
+        taken to create the final dtype.
+
+        The return dtype is specified from the inputs in this prioritized
+        order:
+        1. if dtype is a CategoricalDtype, return dtype
+        2. if dtype is the string 'category', create a CategoricalDtype from
+           the supplied categories and ordered parameters, and return that.
+        3. if values is a categorical, use value.dtype, but override it with
+           categories and ordered if either/both of those are not None.
+        4. if dtype is None and values is not a categorical, construct the
+           dtype from categories and ordered, even if either of those is None.
+
+        Parameters
+        ----------
+        values : list-like, optional
+            The list-like must be 1-dimensional.
+        categories : list-like, optional
+            Categories for the CategoricalDtype.
+        ordered : bool, optional
+            Designating if the categories are ordered.
+        dtype : CategoricalDtype or the string "category", optional
+            If ``CategoricalDtype``, cannot be used together with
+            `categories` or `ordered`.
+
+        Returns
+        -------
+        CategoricalDtype
+
+        Examples
+        --------
+        >>> CategoricalDtype._from_values_or_dtype()
+        CategoricalDtype(categories=None, ordered=None)
+        >>> CategoricalDtype._from_values_or_dtype(categories=['a', 'b'],
+        ...                                        ordered=True)
+        CategoricalDtype(categories=['a', 'b'], ordered=True)
+        >>> dtype1 = CategoricalDtype(['a', 'b'], ordered=True)
+        >>> dtype2 = CategoricalDtype(['x', 'y'], ordered=False)
+        >>> c = Categorical([0, 1], dtype=dtype1, fastpath=True)
+        >>> CategoricalDtype._from_values_or_dtype(c, ['x', 'y'], ordered=True,
+        ...                                        dtype=dtype2)
+        ValueError: Cannot specify `categories` or `ordered` together with
+        `dtype`.
+
+        The supplied dtype takes precedence over values' dtype:
+
+        >>> CategoricalDtype._from_values_or_dtype(c, dtype=dtype2)
+        CategoricalDtype(['x', 'y'], ordered=False)
+        """
+        from pandas.core.dtypes.common import is_categorical
+
+        if dtype is not None:
+            # The dtype argument takes precedence over values.dtype (if any)
+            if isinstance(dtype, compat.string_types):
+                if dtype == 'category':
+                    dtype = CategoricalDtype(categories, ordered)
+                else:
+                    msg = "Unknown dtype {dtype!r}"
+                    raise ValueError(msg.format(dtype=dtype))
+            elif categories is not None or ordered is not None:
+                raise ValueError("Cannot specify `categories` or `ordered` "
+                                 "together with `dtype`.")
+        elif is_categorical(values):
+            # If no "dtype" was passed, use the one from "values", but honor
+            # the "ordered" and "categories" arguments
+            dtype = values.dtype._from_categorical_dtype(values.dtype,
+                                                         categories, ordered)
+        else:
+            # If dtype=None and values is not categorical, create a new dtype.
+            # Note: This could potentially have categories=None and
+            # ordered=None.
+            dtype = CategoricalDtype(categories, ordered)
+
+        return dtype
+
     def _finalize(self, categories, ordered, fastpath=False):
 
         if ordered is not None:
@@ -408,7 +493,10 @@ def validate_categories(categories, fastpath=False):
         """
         from pandas import Index
 
-        if not isinstance(categories, ABCIndexClass):
+        if not fastpath and not is_list_like(categories):
+            msg = "Parameter 'categories' must be list-like, was {!r}"
+            raise TypeError(msg.format(categories))
+        elif not isinstance(categories, ABCIndexClass):
             categories = Index(categories, tupleize_cols=False)
 
         if not fastpath:

diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -107,29 +107,23 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None,
             if fastpath:
                 return cls._simple_new(data, name=name, dtype=dtype)
 
+        dtype = CategoricalDtype._from_values_or_dtype(data, categories,
+                                                       ordered, dtype)
+
         if name is None and hasattr(data, 'name'):
             name = data.name
 
-        if isinstance(data, ABCCategorical):
-            data = cls._create_categorical(data, categories, ordered,
-                                           dtype)
-        elif isinstance(data, CategoricalIndex):
-            data = data._data
-            data = cls._create_categorical(data, categories, ordered,
-                                           dtype)
-        else:
-
+        if not is_categorical_dtype(data):
             # don't allow scalars
             # if data is None, then categories must be provided
             if is_scalar(data):
                 if data is not None or categories is None:
                     cls._scalar_data_error(data)
                 data = []
-            data = cls._create_categorical(data, categories, ordered,
-                                           dtype)
 
-        if copy:
-            data = data.copy()
+        data = cls._create_categorical(data, dtype=dtype)
+
+        data = data.copy() if copy else data
 
         return cls._simple_new(data, name=name)
 
@@ -159,8 +153,7 @@ def _create_from_codes(self, codes, dtype=None, name=None):
         return CategoricalIndex(cat, name=name)
 
     @classmethod
-    def _create_categorical(cls, data, categories=None, ordered=None,
-                            dtype=None):
+    def _create_categorical(cls, data, dtype=None):
         """
         *this is an internal non-public method*
 
@@ -169,8 +162,6 @@ def _create_categorical(cls, data, categories=None, ordered=None,
         Parameters
         ----------
         data : data for new Categorical
-        categories : optional categories, defaults to existing
-        ordered : optional ordered attribute, defaults to existing
         dtype : CategoricalDtype, defaults to existing
 
         Returns
@@ -182,18 +173,11 @@ def _create_categorical(cls, data, categories=None, ordered=None,
             data = data.values
 
         if not isinstance(data, ABCCategorical):
-            if ordered is None and dtype is None:
-                ordered = False
-            data = Categorical(data, categories=categories, ordered=ordered,
-                               dtype=dtype)
-        else:
-            if categories is not None:
-                data = data.set_categories(categories, ordered=ordered)
-            elif ordered is not None and ordered != data.ordered:
-                data = data.set_ordered(ordered)
-            if isinstance(dtype, CategoricalDtype) and dtype != data.dtype:
-                # we want to silently ignore dtype='category'
-                data = data._set_dtype(dtype)
+            return Categorical(data, dtype=dtype)
+
+        if isinstance(dtype, CategoricalDtype) and dtype != data.dtype:
+            # we want to silently ignore dtype='category'
+            data = data._set_dtype(dtype)
         return data
 
     @classmethod

diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
@@ -368,7 +368,7 @@ def test_constructor_str_category(self, categories, ordered):
         tm.assert_categorical_equal(result, expected)
 
     def test_constructor_str_unknown(self):
-        with pytest.raises(ValueError, match="Unknown `dtype`"):
+        with pytest.raises(ValueError, match="Unknown dtype"):
             Categorical([1, 2], dtype="foo")
 
     def test_constructor_from_categorical_with_dtype(self):

diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py
@@ -90,10 +90,40 @@ def test_construction_from_string(self):
             TypeError, lambda: CategoricalDtype.construct_from_string('foo'))
 
     def test_constructor_invalid(self):
-        msg = "CategoricalIndex.* must be called"
+        msg = "Parameter 'categories' must be list-like"
         with pytest.raises(TypeError, match=msg):
             CategoricalDtype("category")
 
+    dtype1 = CategoricalDtype(['a', 'b'], ordered=True)
+    dtype2 = CategoricalDtype(['x', 'y'], ordered=False)
+    c = Categorical([0, 1], dtype=dtype1, fastpath=True)
+
+    @pytest.mark.parametrize('values, categories, ordered, dtype, expected',
+                             [
+                                 [None, None, None, None,
+                                  CategoricalDtype()],
+                                 [None, ['a', 'b'], True, None, dtype1],
+                                 [c, None, None, dtype2, dtype2],
+                                 [c, ['x', 'y'], False, None, dtype2],
+                             ])
+    def test_from_values_or_dtype(
+            self, values, categories, ordered, dtype, expected):
+        result = CategoricalDtype._from_values_or_dtype(values, categories,
+                                                        ordered, dtype)
+        assert result == expected
+
+    @pytest.mark.parametrize('values, categories, ordered, dtype', [
+        [None, ['a', 'b'], True, dtype2],
+        [None, ['a', 'b'], None, dtype2],
+        [None, None, True, dtype2],
+    ])
+    def test_from_values_or_dtype_raises(self, values, categories,
+                                         ordered, dtype):
+        msg = "Cannot specify `categories` or `ordered` together with `dtype`."
+        with pytest.raises(ValueError, match=msg):
+            CategoricalDtype._from_values_or_dtype(values, categories,
+                                                   ordered, dtype)
+
     def test_is_dtype(self):
         assert CategoricalDtype.is_dtype(self.dtype)
         assert CategoricalDtype.is_dtype('category')
@@ -706,7 +736,7 @@ def test_invalid_raises(self):
         with pytest.raises(TypeError, match='ordered'):
             CategoricalDtype(['a', 'b'], ordered='foo')
 
-        with pytest.raises(TypeError, match='collection'):
+        with pytest.raises(TypeError, match="'categories' must be list-like"):
             CategoricalDtype('category')
 
     def test_mixed(self):

diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py
@@ -158,7 +158,7 @@ def test_construction_with_categorical_dtype(self):
         tm.assert_index_equal(result, expected, exact=True)
 
         # error when combining categories/ordered and dtype kwargs
-        msg = 'Cannot specify both `dtype` and `categories` or `ordered`.'
+        msg = 'Cannot specify `categories` or `ordered` together with `dtype`.'
         with pytest.raises(ValueError, match=msg):
             CategoricalIndex(data, categories=cats, dtype=dtype)