Skip to content

Commit

Permalink
ENH: support for ordered factors in groupby, close #292
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Jun 5, 2012
1 parent 4655828 commit ea1186d
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 4 deletions.
8 changes: 5 additions & 3 deletions pandas/core/factor.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class Factor(object):
* labels : ndarray
* levels : ndarray
"""
def __init__(self, labels, levels):
def __init__(self, labels, levels, name=None):
from pandas.core.index import _ensure_index

levels = _ensure_index(levels)
Expand All @@ -30,6 +30,7 @@ def __init__(self, labels, levels):

self.labels = labels
self.levels = levels
self.name = name

@classmethod
def from_array(cls, data):
Expand All @@ -51,9 +52,10 @@ def __len__(self):
return len(self.labels)

def __repr__(self):
temp = 'Factor:\n%s\nLevels (%d): %s'
temp = 'Factor:%s\n%s\nLevels (%d): %s'
values = np.asarray(self)
return temp % (repr(values), len(self.levels), self.levels)
return temp % ('' if self.name is None else self.name,
repr(values), len(self.levels), self.levels)

def __getitem__(self, key):
if isinstance(key, (int, np.integer)):
Expand Down
12 changes: 12 additions & 0 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import types
import numpy as np

from pandas.core.factor import Factor
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.index import Index, MultiIndex, _ensure_index
Expand Down Expand Up @@ -972,6 +973,17 @@ def __init__(self, index, grouper=None, name=None, level=None,
else:
if isinstance(self.grouper, (list, tuple)):
self.grouper = com._asarray_tuplesafe(self.grouper)
elif isinstance(self.grouper, Factor):
factor = self.grouper
self._was_factor = True

# Is there any way to avoid this?
self.grouper = np.asarray(factor)

self._labels = factor.labels
self._group_index = factor.levels
if self.name is None:
self.name = factor.name

# no level passed
if not isinstance(self.grouper, np.ndarray):
Expand Down
29 changes: 28 additions & 1 deletion pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pandas import bdate_range
from pandas.core.index import Index, MultiIndex
from pandas.core.common import rands
from pandas.core.frame import DataFrame
from pandas.core.api import Factor, DataFrame
from pandas.core.groupby import GroupByError
from pandas.core.series import Series
from pandas.util.testing import (assert_panel_equal, assert_frame_equal,
Expand Down Expand Up @@ -1879,6 +1879,33 @@ def test_no_dummy_key_names(self):
self.df['B'].values]).sum()
self.assert_(result.index.names == [None, None])

def test_groupby_factor(self):
    """Check that grouping a DataFrame by a Factor works.

    Two things are verified:

    * ``mean`` over the Factor groups equals ``mean`` over the Factor's
      materialized values, reindexed to the Factor's level order, and
      the resulting index carries the Factor's name;
    * ``describe`` over the Factor groups equals ``describe`` over the
      same rows pre-ordered by the Factor's integer labels and grouped
      with ``sort=False``.
    """
    level_names = ['foo', 'bar', 'baz', 'qux']
    codes = np.random.randint(0, 4, size=100)
    factor = Factor(codes, level_names, name='myfactor')

    frame = DataFrame(np.random.randn(100, 4))

    # Aggregation path: compare against grouping by the plain values,
    # put into the order of the factor's levels.
    result = frame.groupby(factor).mean()
    expected = frame.groupby(np.asarray(factor)).mean().reindex(level_names)

    assert_frame_equal(result, expected)
    self.assert_(result.index.name == factor.name)

    # describe path: build the reference from rows arranged by label code
    # so that an unsorted groupby sees the groups in level order.
    desc_result = frame.groupby(factor).describe()

    order = factor.labels.argsort()
    values_in_order = np.asarray(factor).take(order)
    frame_in_order = frame.take(order)
    desc_expected = frame_in_order.groupby(values_in_order,
                                           sort=False).describe()
    assert_frame_equal(desc_result, desc_expected)



def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
tups = map(tuple, df[keys].values)
tups = com._asarray_tuplesafe(tups)
Expand Down

0 comments on commit ea1186d

Please sign in to comment.