ENH: Allow the groupby by param to handle columns and index levels (GH5677) (#14432)
pandas-dev · Dec 14, 2016 · a8cabb8 · a8cabb8
1 parent abdfa3e
commit a8cabb8
Show file tree

Hide file tree

Showing 5 changed files with 252 additions and 15 deletions.
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
@@ -94,11 +94,21 @@ The mapping can be specified many different ways:
   - For DataFrame objects, a string indicating a column to be used to group. Of
     course ``df.groupby('A')`` is just syntactic sugar for
     ``df.groupby(df['A'])``, but it makes life simpler
+  - For DataFrame objects, a string indicating an index level to be used to group.
   - A list of any of the above things
 
 Collectively we refer to the grouping objects as the **keys**. For example,
 consider the following DataFrame:
 
+.. note::
+
+   .. versionadded:: 0.20
+
+   A string passed to ``groupby`` may refer to either a column or an index level.
+   If a string matches both a column name and an index level name, a
+   ``FutureWarning`` is issued and the column takes precedence. In a future
+   version this will raise an ambiguity error.
+
 .. ipython:: python
 
    df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
@@ -237,17 +247,6 @@ the length of the ``groups`` dict, so it is largely just a convenience:
    gb.aggregate  gb.count      gb.cumprod    gb.dtype      gb.first      gb.groups     gb.hist       gb.max        gb.min        gb.nth        gb.prod       gb.resample   gb.sum        gb.var
    gb.apply      gb.cummax     gb.cumsum     gb.fillna     gb.gender     gb.head       gb.indices    gb.mean       gb.name       gb.ohlc       gb.quantile   gb.size       gb.tail       gb.weight
 
-
-.. ipython:: python
-   :suppress:
-
-   df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
-                             'foo', 'bar', 'foo', 'foo'],
-                      'B' : ['one', 'one', 'two', 'three',
-                             'two', 'two', 'one', 'three'],
-                      'C' : np.random.randn(8),
-                      'D' : np.random.randn(8)})
-
 .. _groupby.multiindex:
 
 GroupBy with MultiIndex
@@ -289,7 +288,9 @@ chosen level:
 
    s.sum(level='second')
 
-Also as of v0.6, grouping with multiple levels is supported.
+.. versionadded:: 0.6
+
+Grouping with multiple levels is supported.
 
 .. ipython:: python
    :suppress:
@@ -306,15 +307,73 @@ Also as of v0.6, grouping with multiple levels is supported.
    s
    s.groupby(level=['first', 'second']).sum()
 
+.. versionadded:: 0.20
+
+Index level names may be supplied as keys.
+
+.. ipython:: python
+
+   s.groupby(['first', 'second']).sum()
+
 More on the ``sum`` function and aggregation later.
 
+Grouping DataFrame with Index Levels and Columns
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+A DataFrame may be grouped by a combination of columns and index levels by
+specifying the column names as strings and the index levels as ``pd.Grouper``
+objects.
+
+.. ipython:: python
+
+   arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
+             ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
+
+   index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])
+
+   df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 3, 3],
+                      'B': np.arange(8)},
+                     index=index)
+
+   df
+
+The following example groups ``df`` by the ``second`` index level and
+the ``A`` column.
+
+.. ipython:: python
+
+   df.groupby([pd.Grouper(level=1), 'A']).sum()
+
+Index levels may also be specified by name.
+
+.. ipython:: python
+
+   df.groupby([pd.Grouper(level='second'), 'A']).sum()
+
+.. versionadded:: 0.20
+
+Index level names may be passed directly as keys to ``groupby``.
+
+.. ipython:: python
+
+   df.groupby(['second', 'A']).sum()
+
 DataFrame column selection in GroupBy
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Once you have created the GroupBy object from a DataFrame, for example, you
 might want to do something different for each of the columns. Thus, using
 ``[]`` similar to getting a column from a DataFrame, you can do:
 
+.. ipython:: python
+   :suppress:
+
+   df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
+                             'foo', 'bar', 'foo', 'foo'],
+                      'B' : ['one', 'one', 'two', 'three',
+                             'two', 'two', 'one', 'three'],
+                      'C' : np.random.randn(8),
+                      'D' : np.random.randn(8)})
+
 .. ipython:: python
 
    grouped = df.groupby(['A'])

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -51,6 +51,20 @@ Other enhancements
 - ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`)
 
 - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`)
+- Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now reference either column names or index level names (:issue:`5677`)
+
+.. ipython:: python
+
+   arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
+             ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
+
+   index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])
+
+   df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 3, 3],
+                      'B': np.arange(8)},
+                     index=index)
+
+   df.groupby(['second', 'A']).sum()
 
 
 - Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -4007,7 +4007,7 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
         Parameters
         ----------
         by : mapping function / list of functions, dict, Series, or tuple /
-            list of column names.
+            list of column names or index level names.
             Called on each element of the object index to determine the groups.
             If a dict or Series is passed, the Series or dict VALUES will be
             used to determine the groups

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -2459,8 +2459,20 @@ def is_in_obj(gpr):
             exclusions.append(name)
 
         elif is_in_axis(gpr):  # df.groupby('name')
-            in_axis, name, gpr = True, gpr, obj[gpr]
-            exclusions.append(name)
+            if gpr in obj:
+                if gpr in obj.index.names:
+                    warnings.warn(
+                        ("'%s' is both a column name and an index level.\n"
+                         "Defaulting to column but "
+                         "this will raise an ambiguity error in a "
+                         "future version") % gpr,
+                        FutureWarning, stacklevel=2)
+                in_axis, name, gpr = True, gpr, obj[gpr]
+                exclusions.append(name)
+            elif gpr in obj.index.names:
+                in_axis, name, level, gpr = False, None, gpr, None
+            else:
+                raise KeyError(gpr)
         elif isinstance(gpr, Grouper) and gpr.key is not None:
             # Add key to exclusions
             exclusions.append(gpr.key)

diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -521,6 +521,158 @@ def test_grouper_column_and_index(self):
         expected = df_single.reset_index().groupby(['inner', 'B']).mean()
         assert_frame_equal(result, expected)
 
+    def test_grouper_index_level_as_string(self):
+        # GH 5677, allow strings passed as the `by` parameter to reference
+        # columns or index levels
+
+        idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3),
+                                         ('b', 1), ('b', 2), ('b', 3)])
+        idx.names = ['outer', 'inner']
+        df_multi = pd.DataFrame({"A": np.arange(6),
+                                 'B': ['one', 'one', 'two',
+                                       'two', 'one', 'one']},
+                                index=idx)
+
+        df_single = df_multi.reset_index('outer')
+
+        # Column and Index on MultiIndex
+        result = df_multi.groupby(['B', 'inner']).mean()
+        expected = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean()
+        assert_frame_equal(result, expected)
+
+        # Index and Column on MultiIndex
+        result = df_multi.groupby(['inner', 'B']).mean()
+        expected = df_multi.groupby([pd.Grouper(level='inner'), 'B']).mean()
+        assert_frame_equal(result, expected)
+
+        # Column and Index on single Index
+        result = df_single.groupby(['B', 'inner']).mean()
+        expected = df_single.groupby(['B', pd.Grouper(level='inner')]).mean()
+        assert_frame_equal(result, expected)
+
+        # Index and Column on single Index
+        result = df_single.groupby(['inner', 'B']).mean()
+        expected = df_single.groupby([pd.Grouper(level='inner'), 'B']).mean()
+        assert_frame_equal(result, expected)
+
+        # Single element list of Index on MultiIndex
+        result = df_multi.groupby(['inner']).mean()
+        expected = df_multi.groupby(pd.Grouper(level='inner')).mean()
+        assert_frame_equal(result, expected)
+
+        # Single element list of Index on single Index
+        result = df_single.groupby(['inner']).mean()
+        expected = df_single.groupby(pd.Grouper(level='inner')).mean()
+        assert_frame_equal(result, expected)
+
+        # Index on MultiIndex
+        result = df_multi.groupby('inner').mean()
+        expected = df_multi.groupby(pd.Grouper(level='inner')).mean()
+        assert_frame_equal(result, expected)
+
+        # Index on single Index
+        result = df_single.groupby('inner').mean()
+        expected = df_single.groupby(pd.Grouper(level='inner')).mean()
+        assert_frame_equal(result, expected)
+
+    def test_grouper_column_index_level_precedence(self):
+        # GH 5677, when a string passed as the `by` parameter
+        # matches a column and an index level the column takes
+        # precedence
+
+        idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3),
+                                         ('b', 1), ('b', 2), ('b', 3)])
+        idx.names = ['outer', 'inner']
+        df_multi_both = pd.DataFrame({"A": np.arange(6),
+                                      'B': ['one', 'one', 'two',
+                                            'two', 'one', 'one'],
+                                      'inner': [1, 1, 1, 1, 1, 1]},
+                                     index=idx)
+
+        df_single_both = df_multi_both.reset_index('outer')
+
+        # Group MultiIndex by single key
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            result = df_multi_both.groupby('inner').mean()
+
+        expected = df_multi_both.groupby([pd.Grouper(key='inner')]).mean()
+        assert_frame_equal(result, expected)
+        not_expected = df_multi_both.groupby(pd.Grouper(level='inner')).mean()
+        self.assertFalse(result.index.equals(not_expected.index))
+
+        # Group single Index by single key
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            result = df_single_both.groupby('inner').mean()
+
+        expected = df_single_both.groupby([pd.Grouper(key='inner')]).mean()
+        assert_frame_equal(result, expected)
+        not_expected = df_single_both.groupby(pd.Grouper(level='inner')).mean()
+        self.assertFalse(result.index.equals(not_expected.index))
+
+        # Group MultiIndex by single key list
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            result = df_multi_both.groupby(['inner']).mean()
+
+        expected = df_multi_both.groupby([pd.Grouper(key='inner')]).mean()
+        assert_frame_equal(result, expected)
+        not_expected = df_multi_both.groupby(pd.Grouper(level='inner')).mean()
+        self.assertFalse(result.index.equals(not_expected.index))
+
+        # Group single Index by single key list
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            result = df_single_both.groupby(['inner']).mean()
+
+        expected = df_single_both.groupby([pd.Grouper(key='inner')]).mean()
+        assert_frame_equal(result, expected)
+        not_expected = df_single_both.groupby(pd.Grouper(level='inner')).mean()
+        self.assertFalse(result.index.equals(not_expected.index))
+
+        # Group MultiIndex by two keys (1)
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            result = df_multi_both.groupby(['B', 'inner']).mean()
+
+        expected = df_multi_both.groupby(['B',
+                                          pd.Grouper(key='inner')]).mean()
+        assert_frame_equal(result, expected)
+        not_expected = df_multi_both.groupby(['B',
+                                              pd.Grouper(level='inner')
+                                              ]).mean()
+        self.assertFalse(result.index.equals(not_expected.index))
+
+        # Group MultiIndex by two keys (2)
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            result = df_multi_both.groupby(['inner', 'B']).mean()
+
+        expected = df_multi_both.groupby([pd.Grouper(key='inner'),
+                                          'B']).mean()
+        assert_frame_equal(result, expected)
+        not_expected = df_multi_both.groupby([pd.Grouper(level='inner'),
+                                              'B']).mean()
+        self.assertFalse(result.index.equals(not_expected.index))
+
+        # Group single Index by two keys (1)
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            result = df_single_both.groupby(['B', 'inner']).mean()
+
+        expected = df_single_both.groupby(['B',
+                                           pd.Grouper(key='inner')]).mean()
+        assert_frame_equal(result, expected)
+        not_expected = df_single_both.groupby(['B',
+                                               pd.Grouper(level='inner')
+                                               ]).mean()
+        self.assertFalse(result.index.equals(not_expected.index))
+
+        # Group single Index by two keys (2)
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            result = df_single_both.groupby(['inner', 'B']).mean()
+
+        expected = df_single_both.groupby([pd.Grouper(key='inner'),
+                                           'B']).mean()
+        assert_frame_equal(result, expected)
+        not_expected = df_single_both.groupby([pd.Grouper(level='inner'),
+                                               'B']).mean()
+        self.assertFalse(result.index.equals(not_expected.index))
+
     def test_grouper_getting_correct_binner(self):
 
         # GH 10063