Skip to content

Commit

Permalink
Merge pull request pandas-dev#3599 from jreback/groupby_output
Browse files Browse the repository at this point in the history
BUG: Add squeeze keyword to groupby to allow reduction in returned type
  • Loading branch information
jreback committed May 15, 2013
2 parents f34de9e + 65abb6b commit e82003f
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 14 deletions.
4 changes: 4 additions & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ pandas 0.11.1
``timedelta64[ns]`` to ``object/int`` (GH3425_)
- Do not allow datetimelike/timedeltalike creation except with valid types
(e.g. cannot pass ``datetime64[ms]``) (GH3423_)
- Add ``squeeze`` keyword to ``groupby`` to allow reduction from
DataFrame -> Series if groups are unique. Regression from 0.10.1,
partial revert of (GH2893_) via (GH3596_)

**Bug Fixes**

Expand Down Expand Up @@ -161,6 +164,7 @@ pandas 0.11.1
.. _GH3594: https://github.com/pydata/pandas/issues/3594
.. _GH3590: https://github.com/pydata/pandas/issues/3590
.. _GH3610: https://github.com/pydata/pandas/issues/3610
.. _GH3596: https://github.com/pydata/pandas/issues/3596
.. _GH3435: https://github.com/pydata/pandas/issues/3435


Expand Down
22 changes: 22 additions & 0 deletions doc/source/v0.11.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,26 @@ API changes
p / p
p / 0

- Add ``squeeze`` keyword to ``groupby`` to allow reduction from
DataFrame -> Series if groups are unique. This is a regression from 0.10.1.
We are reverting back to the prior behavior. This means groupby will return the
same shaped objects whether the groups are unique or not. This is a partial
revert of (GH2893_), addressed in (GH3596_).

.. ipython:: python

df2 = DataFrame([{"val1": 1, "val2" : 20}, {"val1":1, "val2": 19},
{"val1":1, "val2": 27}, {"val1":1, "val2": 12}])
def func(dataf):
return dataf["val2"] - dataf["val2"].mean()

# squeezing the result frame to a series (because we have unique groups)
df2.groupby("val1", squeeze=True).apply(func)

# no squeezing (the default, and behavior in 0.10.1)
df2.groupby("val1").apply(func)


Enhancements
~~~~~~~~~~~~
- ``pd.read_html()`` can now parse HTML string, files or urls and return dataframes
Expand All @@ -44,5 +64,7 @@ on GitHub for a complete list.
.. _GH3477: https://github.com/pydata/pandas/issues/3477
.. _GH3492: https://github.com/pydata/pandas/issues/3492
.. _GH3499: https://github.com/pydata/pandas/issues/3499
.. _GH2893: https://github.com/pydata/pandas/issues/2893
.. _GH3596: https://github.com/pydata/pandas/issues/3596
.. _GH3590: https://github.com/pydata/pandas/issues/3590
.. _GH3435: https://github.com/pydata/pandas/issues/3435
8 changes: 6 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def get(self, key, default=None):
return default

def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
group_keys=True):
group_keys=True, squeeze=False):
"""
Group series using mapper (dict or key function, apply given function
to group, return result as series) or by a series of columns
Expand All @@ -131,6 +131,9 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
Sort group keys. Get better performance by turning this off
group_keys : boolean, default True
When calling apply, add group keys to index to identify pieces
squeeze : boolean, default False
reduce the dimensionality of the return type if possible, otherwise
return a consistent type
Examples
--------
Expand All @@ -150,7 +153,8 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
from pandas.core.groupby import groupby
axis = self._get_axis_number(axis)
return groupby(self, by, axis=axis, level=level, as_index=as_index,
sort=sort, group_keys=group_keys)
sort=sort, group_keys=group_keys,
squeeze=squeeze)

def asfreq(self, freq, method=None, how=None, normalize=False):
"""
Expand Down
26 changes: 17 additions & 9 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ class GroupBy(object):

def __init__(self, obj, keys=None, axis=0, level=None,
grouper=None, exclusions=None, selection=None, as_index=True,
sort=True, group_keys=True):
sort=True, group_keys=True, squeeze=False):
self._selection = selection

if isinstance(obj, NDFrame):
Expand All @@ -189,6 +189,7 @@ def __init__(self, obj, keys=None, axis=0, level=None,
self.keys = keys
self.sort = sort
self.group_keys = group_keys
self.squeeze = squeeze

if grouper is None:
grouper, exclusions = _get_grouper(obj, keys, axis=axis,
Expand Down Expand Up @@ -1841,15 +1842,22 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
all_indexed_same = _all_indexes_same([x.index for x in values])
singular_series = len(values) == 1 and applied_index.nlevels == 1

# assign the name to this series
if singular_series:
values[0].name = keys[0]
# GH3596
# provide a reduction (Frame -> Series) if groups are unique
if self.squeeze:

# GH2893
# we have series in the values array, we want to produce a series:
# if any of the sub-series are not indexed the same
# OR we don't have a multi-index and we have only a single value
if singular_series or not all_indexed_same:
# assign the name to this series
if singular_series:
values[0].name = keys[0]

# GH2893
# we have series in the values array, we want to produce a series:
# if any of the sub-series are not indexed the same
# OR we don't have a multi-index and we have only a single value
return self._concat_objects(keys, values,
not_indexed_same=not_indexed_same)

if not all_indexed_same:
return self._concat_objects(keys, values,
not_indexed_same=not_indexed_same)

Expand Down
11 changes: 8 additions & 3 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,24 +263,29 @@ def test_groupby_nonobject_dtype(self):

def test_groupby_return_type(self):

# GH2893
# GH2893, return a reduced type
df1 = DataFrame([{"val1": 1, "val2" : 20}, {"val1":1, "val2": 19},
{"val1":2, "val2": 27}, {"val1":2, "val2": 12}])

def func(dataf):
return dataf["val2"] - dataf["val2"].mean()

result = df1.groupby("val1").apply(func)
result = df1.groupby("val1", squeeze=True).apply(func)
self.assert_(isinstance(result,Series))

df2 = DataFrame([{"val1": 1, "val2" : 20}, {"val1":1, "val2": 19},
{"val1":1, "val2": 27}, {"val1":1, "val2": 12}])
def func(dataf):
return dataf["val2"] - dataf["val2"].mean()

result = df2.groupby("val1").apply(func)
result = df2.groupby("val1", squeeze=True).apply(func)
self.assert_(isinstance(result,Series))

# GH3596, return a consistent type (regression in 0.11 from 0.10.1)
df = DataFrame([[1,1],[1,1]],columns=['X','Y'])
result = df.groupby('X',squeeze=False).count()
self.assert_(isinstance(result,DataFrame))

def test_agg_regression1(self):
grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
result = grouped.agg(np.mean)
Expand Down

0 comments on commit e82003f

Please sign in to comment.