pandas-dev · h-vetinari · Dec 2, 2018 · Dec 5, 2018 · Dec 5, 2018 · Dec 5, 2018
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -320,6 +320,64 @@ Example:
 See the :ref:`advanced docs on renaming<advanced.index_names>` for more details.
 
 
+.. _whatsnew_0240.enhancements.unique:
+
+Changes to the ``unique``-method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The three related methods :meth:`pandas.unique`, :meth:`Series.unique` and
+:meth:`Index.unique` now support the keyword ``return_inverse``, which, if passed,
+makes the output a tuple where the second component is an object that contains the
+mapping from the indices of the values to their location in the returned unique values.
+
+.. ipython:: python
+
+    idx = pd.Index([1, 0, 0, 1])
+    uniques, inverse = idx.unique(return_inverse=True)
+    uniques
+    inverse
+    reconstruct = uniques[inverse]
+    reconstruct.equals(idx)
+
+For :class:`Series`, the ``unique`` method has also gained the ``raw``-keyword, which
+allows toggling between the behavior before v0.24.0 (returning an ``np.ndarray``
+or ``Categorical``), and the future behavior of returning a ``Series``.
+
+.. ipython:: python
+
+    pd.Series([1, 1, 3, 2], name='A').unique(raw=False)
+    pd.Series([1, 1, 3, 2], name='A').unique(raw=True)
+
+The ``return_inverse``-keyword is only available if ``raw=False``, since it is necessary
+to reconstruct both the values and the index of a ``Series`` for an inverse (to illustrate
+that the index is maintained, we pass a non-default index in the example below).
+
+.. ipython:: python
+
+    animals = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama'],
+                        index=[1, 4, 9, 16, 25])
+    animals_unique, inverse = animals.unique(raw=False, return_inverse=True)
+    animals_unique
+    inverse
+
+This can be used to reconstruct the original object from its unique values as follows:
+
+.. ipython:: python
+
+    reconstruct = animals_unique.reindex(inverse)
+    reconstruct
+
+We see that the values of ``animals`` get reconstructed correctly, but the index does
+not match yet -- consequently, the last step is to correctly set the index.
+
+
+.. ipython:: python
+
+    reconstruct.index = inverse.index
+    reconstruct
+    reconstruct.equals(animals)
+
+
 .. _whatsnew_0240.enhancements.other:
 
 Other Enhancements
@@ -1103,6 +1161,8 @@ Deprecations
 - :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`)
 - :meth:`MultiIndex.to_hierarchical` is deprecated and will be removed in a future version (:issue:`21613`)
 - :meth:`Series.ptp` is deprecated. Use ``numpy.ptp`` instead (:issue:`21614`)
+- :meth:`Series.unique` has deprecated returning an array and will return a Series in the future. The behavior can be controlled by the ``raw``-keyword.
+  The recommended method to get an array is to pass ``raw=False`` and use ``.array`` on the result.
 - :meth:`Series.compress` is deprecated. Use ``Series[condition]`` instead (:issue:`18262`)
 - The signature of :meth:`Series.to_csv` has been uniformed to that of :meth:`DataFrame.to_csv`: the name of the first argument is now ``path_or_buf``, the order of subsequent arguments has changed, the ``header`` argument now defaults to ``True``. (:issue:`19715`)
 - :meth:`Categorical.from_codes` has deprecated providing float values for the ``codes`` argument. (:issue:`21767`)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -271,7 +271,7 @@ def match(to_match, values, na_sentinel=-1):
     return result
 
 
-def unique(values):
+def unique(values, return_inverse=False):
     """
     Hash table-based unique. Uniques are returned in order
     of appearance. This does NOT sort.
@@ -347,15 +347,22 @@ def unique(values):
 
     values = _ensure_arraylike(values)
 
-    if is_extension_array_dtype(values):
+    if isinstance(values, ABCSeries):
+        # this calls through Series, need raw=True to not raise warning
+        return values.unique(raw=True)
+    elif is_extension_array_dtype(values):
         # Dispatch to extension dtype's unique.
         return values.unique()
 
     original = values
     htable, _, values, dtype, ndtype = _get_hashtable_algo(values)
 
     table = htable(len(values))
-    uniques = table.unique(values)
+    if return_inverse:
+        uniques, inverse = table.unique(values, return_inverse=True)
+    else:
+        uniques = table.unique(values)
+
     uniques = _reconstruct_data(uniques, dtype, original)
 
     if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype):
@@ -365,6 +372,8 @@ def unique(values):
         # TODO: it must return DatetimeArray with tz in pandas 2.0
         uniques = uniques.astype(object).values
 
+    if return_inverse:
+        return uniques, inverse
     return uniques
 
 

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2249,7 +2249,7 @@ def mode(self, dropna=True):
         codes = sorted(htable.mode_int64(ensure_int64(codes), dropna))
         return self._constructor(values=codes, dtype=self.dtype, fastpath=True)
 
-    def unique(self):
+    def unique(self, return_inverse=False):
         """
         Return the ``Categorical`` which ``categories`` and ``codes`` are
         unique. Unused categories are NOT returned.
@@ -2259,9 +2259,22 @@ def unique(self):
         - ordered category: values are sorted by appearance order, categories
           keeps existing order.
 
+        Parameters
+        ----------
+        return_inverse : boolean, default False
+            Whether to return the inverse of the unique values. If True, the
+            output will be a tuple where the second component is an
+            np.ndarray that contains the mapping between the indices of the
+            elements in the calling Categorical and their locations in the
+            unique values. See examples for how to reconstruct.
+
+            .. versionadded:: 0.24.0
+
         Returns
         -------
-        unique values : ``Categorical``
+        uniques : ``Categorical``
+        inverse : np.ndarray (if `return_inverse=True`)
+            The inverse from the `uniques` back to the calling ``Categorical``.
 
         Examples
         --------
@@ -2293,7 +2306,10 @@ def unique(self):
         """
 
         # unlike np.unique, unique1d does not sort
-        unique_codes = unique1d(self.codes)
+        if return_inverse:
+            unique_codes, inverse = unique1d(self.codes, return_inverse=True)
+        else:
+            unique_codes = unique1d(self.codes, return_inverse=False)
         cat = self.copy()
 
         # keep nan in codes
@@ -2303,7 +2319,11 @@ def unique(self):
         take_codes = unique_codes[unique_codes != -1]
         if self.ordered:
             take_codes = np.sort(take_codes)
-        return cat.set_categories(cat.categories.take(take_codes))
+        result = cat.set_categories(cat.categories.take(take_codes))
+
+        if return_inverse:
+            return result, inverse
+        return result
 
     def _values_for_factorize(self):
         codes = self.codes.astype('int64')

diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -1208,15 +1208,24 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
                               normalize=normalize, bins=bins, dropna=dropna)
         return result
 
-    def unique(self):
+    def unique(self, return_inverse=False):
         values = self._values
 
-        if hasattr(values, 'unique'):
-
-            result = values.unique()
+        if is_extension_array_dtype(values):
+            if return_inverse:
+                # return_inverse is not (yet) part of the EA.unique contract,
+                # so probe for support and report the offending dtype if not
+                try:
+                    result = values.unique(return_inverse=return_inverse)
+                except TypeError:
+                    msg = ('extension array of dtype {dtype} does not yet '
+                           'support unique with return_inverse.')
+                    raise ValueError(msg.format(dtype=values.dtype))
+            else:
+                result = values.unique()
         else:
             from pandas.core.algorithms import unique1d
-            result = unique1d(values)
+            result = unique1d(values, return_inverse=return_inverse)
 
         return result
 
@@ -1235,7 +1244,10 @@ def nunique(self, dropna=True):
         -------
         nunique : int
         """
-        uniqs = self.unique()
+        if isinstance(self, ABCSeries):
+            uniqs = self.unique(raw=True)
+        else:
+            uniqs = self.unique()
         n = len(uniqs)
         if dropna and isna(uniqs).any():
             n -= 1

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -1985,9 +1985,21 @@ def dropna(self, how='any'):
 
             .. versionadded:: 0.23.0
 
+        return_inverse : boolean, default False
+            Whether to return the inverse of the unique values. If True, the
+            output will be a tuple where the second component is an
+            np.ndarray that contains the mapping between the indices of the
+            elements in the calling Index and their locations in the
+            unique values. See examples for how to reconstruct.
+
+            .. versionadded:: 0.24.0
+
         Returns
         -------
-        Index without duplicates
+        uniques : Index
+            The ``Index`` without duplicates
+        inverse : np.ndarray (if `return_inverse=True`)
+            The inverse from the `uniques` back to the calling ``Index``.
 
         See Also
         --------
@@ -1996,9 +2008,14 @@ def dropna(self, how='any'):
         """)
 
     @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs)
-    def unique(self, level=None):
+    def unique(self, level=None, return_inverse=False):
         if level is not None:
             self._validate_index_level(level)
+
+        if return_inverse:
+            result, inverse = super(Index, self).unique(return_inverse=True)
+            return self._shallow_copy(result), inverse
+
         result = super(Index, self).unique()
         return self._shallow_copy(result)