diff --git a/pandas/core/factor.py b/pandas/core/factor.py index c242f0b5190bf..dcbf455699205 100644 --- a/pandas/core/factor.py +++ b/pandas/core/factor.py @@ -21,7 +21,7 @@ class Factor(object): * labels : ndarray * levels : ndarray """ - def __init__(self, labels, levels): + def __init__(self, labels, levels, name=None): from pandas.core.index import _ensure_index levels = _ensure_index(levels) @@ -30,6 +30,7 @@ def __init__(self, labels, levels): self.labels = labels self.levels = levels + self.name = name @classmethod def from_array(cls, data): @@ -51,9 +52,10 @@ def __len__(self): return len(self.labels) def __repr__(self): - temp = 'Factor:\n%s\nLevels (%d): %s' + temp = 'Factor:%s\n%s\nLevels (%d): %s' values = np.asarray(self) - return temp % (repr(values), len(self.levels), self.levels) + return temp % ('' if self.name is None else self.name, + repr(values), len(self.levels), self.levels) def __getitem__(self, key): if isinstance(key, (int, np.integer)): diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index b018cbeef2605..1962ea6ccec53 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2,6 +2,7 @@ import types import numpy as np +from pandas.core.factor import Factor from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.index import Index, MultiIndex, _ensure_index @@ -972,6 +973,17 @@ def __init__(self, index, grouper=None, name=None, level=None, else: if isinstance(self.grouper, (list, tuple)): self.grouper = com._asarray_tuplesafe(self.grouper) + elif isinstance(self.grouper, Factor): + factor = self.grouper + self._was_factor = True + + # TODO: is there a way to avoid converting the Factor to an ndarray here?
+ self.grouper = np.asarray(factor) + + self._labels = factor.labels + self._group_index = factor.levels + if self.name is None: + self.name = factor.name # no level passed if not isinstance(self.grouper, np.ndarray): diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 81f779e0220e8..797328843820b 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -7,7 +7,7 @@ from pandas import bdate_range from pandas.core.index import Index, MultiIndex from pandas.core.common import rands -from pandas.core.frame import DataFrame +from pandas.core.api import Factor, DataFrame from pandas.core.groupby import GroupByError from pandas.core.series import Series from pandas.util.testing import (assert_panel_equal, assert_frame_equal, @@ -1879,6 +1879,33 @@ def test_no_dummy_key_names(self): self.df['B'].values]).sum() self.assert_(result.index.names == [None, None]) + def test_groupby_factor(self): + levels = ['foo', 'bar', 'baz', 'qux'] + labels = np.random.randint(0, 4, size=100) + + factor = Factor(labels, levels, name='myfactor') + + data = DataFrame(np.random.randn(100, 4)) + + result = data.groupby(factor).mean() + + expected = data.groupby(np.asarray(factor)).mean() + expected = expected.reindex(levels) + + assert_frame_equal(result, expected) + self.assert_(result.index.name == factor.name) + + grouped = data.groupby(factor) + desc_result = grouped.describe() + + idx = factor.labels.argsort() + ord_labels = np.asarray(factor).take(idx) + ord_data = data.take(idx) + expected = ord_data.groupby(ord_labels, sort=False).describe() + assert_frame_equal(desc_result, expected) + + + def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): tups = map(tuple, df[keys].values) tups = com._asarray_tuplesafe(tups)