Skip to content

Commit

Permalink
Merge pull request pandas-dev#3599 from jreback/groupby_output
Browse files Browse the repository at this point in the history
BUG: Add squeeze keyword to groupby to allow reduction in returned type
  • Loading branch information
jreback committed May 15, 2013
2 parents f34de9e + 65abb6b commit e82003f
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 14 deletions.
4 changes: 4 additions & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ pandas 0.11.1
``timedelta64[ns]`` to ``object/int`` (GH3425_)
- Do not allow datetimelike/timedeltalike creation except with valid types
(e.g. cannot pass ``datetime64[ms]``) (GH3423_)
- Add ``squeeze`` keyword to ``groupby`` to allow reduction from
DataFrame -> Series if groups are unique. Regression from 0.10.1,
partial revert of (GH2893_) via (GH3596_)

**Bug Fixes**

Expand Down Expand Up @@ -161,6 +164,7 @@ pandas 0.11.1
.. _GH3594: https://github.com/pydata/pandas/issues/3594
.. _GH3590: https://github.com/pydata/pandas/issues/3590
.. _GH3610: https://github.com/pydata/pandas/issues/3610
.. _GH3596: https://github.com/pydata/pandas/issues/3596
.. _GH3435: https://github.com/pydata/pandas/issues/3435


Expand Down
22 changes: 22 additions & 0 deletions doc/source/v0.11.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,26 @@ API changes
p / p
p / 0

- Add ``squeeze`` keyword to ``groupby`` to allow reduction from
DataFrame -> Series if groups are unique. This is a regression from 0.10.1.
We are reverting back to the prior behavior. This means groupby will return the
same shaped objects whether the groups are unique or not. This is a partial
revert of (GH2893_), addressed in (GH3596_).

.. ipython:: python

df2 = DataFrame([{"val1": 1, "val2" : 20}, {"val1":1, "val2": 19},
{"val1":1, "val2": 27}, {"val1":1, "val2": 12}])
def func(dataf):
return dataf["val2"] - dataf["val2"].mean()

# squeezing the result frame to a series (because we have unique groups)
df2.groupby("val1", squeeze=True).apply(func)

# no squeezing (the default, and behavior in 0.10.1)
df2.groupby("val1").apply(func)


Enhancements
~~~~~~~~~~~~
- ``pd.read_html()`` can now parse HTML string, files or urls and return dataframes
Expand All @@ -44,5 +64,7 @@ on GitHub for a complete list.
.. _GH3477: https://github.com/pydata/pandas/issues/3477
.. _GH3492: https://github.com/pydata/pandas/issues/3492
.. _GH3499: https://github.com/pydata/pandas/issues/3499
.. _GH2893: https://github.com/pydata/pandas/issues/2893
.. _GH3596: https://github.com/pydata/pandas/issues/3596
.. _GH3590: https://github.com/pydata/pandas/issues/3590
.. _GH3435: https://github.com/pydata/pandas/issues/3435
8 changes: 6 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def get(self, key, default=None):
return default

def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
group_keys=True):
group_keys=True, squeeze=False):
"""
Group series using mapper (dict or key function, apply given function
to group, return result as series) or by a series of columns
Expand All @@ -131,6 +131,9 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
Sort group keys. Get better performance by turning this off
group_keys : boolean, default True
When calling apply, add group keys to index to identify pieces
squeeze : boolean, default False
reduce the dimensionality of the return type if possible, otherwise
return a consistent type
Examples
--------
Expand All @@ -150,7 +153,8 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
from pandas.core.groupby import groupby
axis = self._get_axis_number(axis)
return groupby(self, by, axis=axis, level=level, as_index=as_index,
sort=sort, group_keys=group_keys)
sort=sort, group_keys=group_keys,
squeeze=squeeze)

def asfreq(self, freq, method=None, how=None, normalize=False):
"""
Expand Down
26 changes: 17 additions & 9 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ class GroupBy(object):

def __init__(self, obj, keys=None, axis=0, level=None,
grouper=None, exclusions=None, selection=None, as_index=True,
sort=True, group_keys=True):
sort=True, group_keys=True, squeeze=False):
self._selection = selection

if isinstance(obj, NDFrame):
Expand All @@ -189,6 +189,7 @@ def __init__(self, obj, keys=None, axis=0, level=None,
self.keys = keys
self.sort = sort
self.group_keys = group_keys
self.squeeze = squeeze

if grouper is None:
grouper, exclusions = _get_grouper(obj, keys, axis=axis,
Expand Down Expand Up @@ -1841,15 +1842,22 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
all_indexed_same = _all_indexes_same([x.index for x in values])
singular_series = len(values) == 1 and applied_index.nlevels == 1

# assign the name to this series
if singular_series:
values[0].name = keys[0]
# GH3596
# provide a reduction (Frame -> Series) if groups are unique
if self.squeeze:

# GH2893
# we have series in the values array, we want to produce a series:
# if any of the sub-series are not indexed the same
# OR we don't have a multi-index and we have only a single value
if singular_series or not all_indexed_same:
# assign the name to this series
if singular_series:
values[0].name = keys[0]

# GH2893
# we have series in the values array, we want to produce a series:
# if any of the sub-series are not indexed the same
# OR we don't have a multi-index and we have only a single value
return self._concat_objects(keys, values,
not_indexed_same=not_indexed_same)

if not all_indexed_same:
return self._concat_objects(keys, values,
not_indexed_same=not_indexed_same)

Expand Down
11 changes: 8 additions & 3 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,24 +263,29 @@ def test_groupby_nonobject_dtype(self):

def test_groupby_return_type(self):

# GH2893
# GH2893, return a reduced type
df1 = DataFrame([{"val1": 1, "val2" : 20}, {"val1":1, "val2": 19},
{"val1":2, "val2": 27}, {"val1":2, "val2": 12}])

def func(dataf):
return dataf["val2"] - dataf["val2"].mean()

result = df1.groupby("val1").apply(func)
result = df1.groupby("val1", squeeze=True).apply(func)
self.assert_(isinstance(result,Series))

df2 = DataFrame([{"val1": 1, "val2" : 20}, {"val1":1, "val2": 19},
{"val1":1, "val2": 27}, {"val1":1, "val2": 12}])
def func(dataf):
return dataf["val2"] - dataf["val2"].mean()

result = df2.groupby("val1").apply(func)
result = df2.groupby("val1", squeeze=True).apply(func)
self.assert_(isinstance(result,Series))

# GH3596, return a consistent type (regression in 0.11 from 0.10.1)
df = DataFrame([[1,1],[1,1]],columns=['X','Y'])
result = df.groupby('X',squeeze=False).count()
self.assert_(isinstance(result,DataFrame))

def test_agg_regression1(self):
grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
result = grouped.agg(np.mean)
Expand Down

0 comments on commit e82003f

Please sign in to comment.