diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index d782e3d6858a4..9d2dea3aeb796 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1786,6 +1786,7 @@ Groupby/Resample/Rolling - Bug in :meth:`DataFrame.groupby` did not respect the ``observed`` argument when selecting a column and instead always used ``observed=False`` (:issue:`23970`) - Bug in :func:`pandas.core.groupby.SeriesGroupBy.pct_change` or :func:`pandas.core.groupby.DataFrameGroupBy.pct_change` would previously work across groups when calculating the percent change, where it now correctly works per group (:issue:`21200`, :issue:`21235`). - Bug preventing hash table creation with very large number (2^32) of rows (:issue:`22805`) +- Bug in groupby where grouping on a categorical column raised ``ValueError`` or produced incorrect groups when ``observed=True`` and the column contained ``nan`` (:issue:`24740`, :issue:`21151`). Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 633a1643f6cdd..260417bc0d598 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -299,6 +299,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self._labels = self.grouper.codes if observed: codes = algorithms.unique1d(self.grouper.codes) + codes = codes[codes != -1] else: codes = np.arange(len(categories)) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 144b64025e1c0..e118135ccc75d 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -420,6 +420,39 @@ def test_observed_groups(observed): tm.assert_dict_equal(result, expected) +def test_observed_groups_with_nan(observed): + # GH 24740 + df = pd.DataFrame({'cat': pd.Categorical(['a', np.nan, 'a'], + categories=['a', 'b', 'd']), + 'vals': [1, 2, 3]}) + g = df.groupby('cat', observed=observed) + result = g.groups + if 
observed: + expected = {'a': Index([0, 2], dtype='int64')} + else: + expected = {'a': Index([0, 2], dtype='int64'), + 'b': Index([], dtype='int64'), + 'd': Index([], dtype='int64')} + tm.assert_dict_equal(result, expected) + + +def test_dataframe_categorical_with_nan(observed): + # GH 21151 + s1 = pd.Categorical([np.nan, 'a', np.nan, 'a'], + categories=['a', 'b', 'c']) + s2 = pd.Series([1, 2, 3, 4]) + df = pd.DataFrame({'s1': s1, 's2': s2}) + result = df.groupby('s1', observed=observed).first().reset_index() + if observed: + expected = DataFrame({'s1': pd.Categorical(['a'], + categories=['a', 'b', 'c']), 's2': [2]}) + else: + expected = DataFrame({'s1': pd.Categorical(['a', 'b', 'c'], + categories=['a', 'b', 'c']), + 's2': [2, np.nan, np.nan]}) + tm.assert_frame_equal(result, expected) + + def test_datetime(): # GH9049: ensure backward compatibility levels = pd.date_range('2014-01-01', periods=4)