BUG: DataFrameGroupBy.value_counts includes non-observed categories of non-grouping columns (#46798)
pandas-dev · Aug 7, 2022 · d924d0b · d924d0b
1 parent 85738b3
commit d924d0b
Show file tree

Hide file tree

Showing 3 changed files with 364 additions and 29 deletions.
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -393,6 +393,48 @@ upon serialization. (Related issue :issue:`12997`)
     # Roundtripping now works
     pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index
 
+
+.. _whatsnew_150.notable_bug_fixes.groupby_value_counts_categorical:
+
+DataFrameGroupBy.value_counts with non-grouping categorical columns and ``observed=True``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Previously, calling :meth:`.DataFrameGroupBy.value_counts` with ``observed=True`` incorrectly dropped non-observed categories of non-grouping columns; they are now included with a count of zero (:issue:`46357`).
+
+.. code-block:: ipython
+
+    In [6]: df = pd.DataFrame(["a", "b", "c"], dtype="category").iloc[0:2]
+    In [7]: df
+    Out[7]:
+       0
+    0  a
+    1  b
+
+*Old Behavior*
+
+.. code-block:: ipython
+
+    In [8]: df.groupby(level=0, observed=True).value_counts()
+    Out[8]:
+    0  a    1
+    1  b    1
+    dtype: int64
+
+
+*New Behavior*
+
+.. code-block:: ipython
+
+    In [9]: df.groupby(level=0, observed=True).value_counts()
+    Out[9]:
+    0  a    1
+    1  a    0
+       b    1
+    0  b    0
+       c    0
+    1  c    0
+    dtype: int64
+
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.api_breaking:
 
@@ -820,9 +862,8 @@ Bug fixes
 
 Categorical
 ^^^^^^^^^^^
-- Bug in :meth:`.Categorical.view` not accepting integer dtypes (:issue:`25464`)
-- Bug in :meth:`.CategoricalIndex.union` when the index's categories are integer-dtype and the index contains ``NaN`` values incorrectly raising instead of casting to ``float64`` (:issue:`45362`)
--
+- Bug in :meth:`Categorical.view` not accepting integer dtypes (:issue:`25464`)
+- Bug in :meth:`CategoricalIndex.union` when the index's categories are integer-dtype and the index contains ``NaN`` values incorrectly raising instead of casting to ``float64`` (:issue:`45362`)
 
 Datetimelike
 ^^^^^^^^^^^^

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -70,6 +70,7 @@
     reconstruct_func,
     validate_func_kwargs,
 )
+from pandas.core.arrays.categorical import Categorical
 import pandas.core.common as com
 from pandas.core.construction import create_series_with_explicit_dtype
 from pandas.core.frame import DataFrame
@@ -87,6 +88,7 @@
     MultiIndex,
     all_indexes_same,
 )
+from pandas.core.indexes.category import CategoricalIndex
 from pandas.core.series import Series
 from pandas.core.shared_docs import _shared_docs
 from pandas.core.util.numba_ import maybe_use_numba
@@ -1824,6 +1826,7 @@ def value_counts(
                     key=key,
                     axis=self.axis,
                     sort=self.sort,
+                    observed=False,
                     dropna=dropna,
                 )
                 groupings += list(grouper.groupings)
@@ -1837,6 +1840,19 @@ def value_counts(
             )
             result_series = cast(Series, gb.size())
 
+            # GH-46357 Include non-observed categories
+            # of non-grouping columns regardless of `observed`
+            if any(
+                isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
+                and not grouping._observed
+                for grouping in groupings
+            ):
+                levels_list = [ping.result_index for ping in groupings]
+                multi_index, _ = MultiIndex.from_product(
+                    levels_list, names=[ping.name for ping in groupings]
+                ).sortlevel()
+                result_series = result_series.reindex(multi_index, fill_value=0)
+
             if normalize:
                 # Normalize the results by dividing by the original group sizes.
                 # We are guaranteed to have the first N levels be the
@@ -1847,12 +1863,13 @@ def value_counts(
                 indexed_group_size = result_series.groupby(
                     result_series.index.droplevel(levels),
                     sort=self.sort,
-                    observed=self.observed,
                     dropna=self.dropna,
                 ).transform("sum")
-
                 result_series /= indexed_group_size
 
+                # Zero-size groups (non-observed categories) give 0/0 = NaN; use 0.0
+                result_series = result_series.fillna(0.0)
+
             if sort:
                 # Sort the values and then resort by the main grouping
                 index_level = range(len(self.grouper.groupings))