From 5efa7ecd8c6453d06cd3e2d411345466fce7d929 Mon Sep 17 00:00:00 2001 From: Aly Sivji Date: Sat, 23 Dec 2017 16:29:58 -0600 Subject: [PATCH 01/13] Restructure tests --- pandas/tests/groupby/test_aggregate.py | 1166 ++++++++++++------------ 1 file changed, 601 insertions(+), 565 deletions(-) diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py index cca21fddd116e..3306902209b06 100644 --- a/pandas/tests/groupby/test_aggregate.py +++ b/pandas/tests/groupby/test_aggregate.py @@ -67,123 +67,20 @@ def setup_method(self, method): 'E': np.random.randn(11), 'F': np.random.randn(11)}) - def test_agg_api(self): - - # GH 6337 - # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error - # different api for agg when passed custom function with mixed frame - - df = DataFrame({'data1': np.random.randn(5), - 'data2': np.random.randn(5), - 'key1': ['a', 'a', 'b', 'b', 'a'], - 'key2': ['one', 'two', 'one', 'two', 'one']}) - grouped = df.groupby('key1') - - def peak_to_peak(arr): - return arr.max() - arr.min() - - expected = grouped.agg([peak_to_peak]) - expected.columns = ['data1', 'data2'] - result = grouped.agg(peak_to_peak) - assert_frame_equal(result, expected) - def test_agg_regression1(self): grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.agg(np.mean) expected = grouped.mean() assert_frame_equal(result, expected) - def test_agg_datetimes_mixed(self): - data = [[1, '2012-01-01', 1.0], [2, '2012-01-02', 2.0], [3, None, 3.0]] - - df1 = DataFrame({'key': [x[0] for x in data], - 'date': [x[1] for x in data], - 'value': [x[2] for x in data]}) - - data = [[row[0], datetime.strptime(row[1], '%Y-%m-%d').date() if row[1] - else None, row[2]] for row in data] - - df2 = DataFrame({'key': [x[0] for x in data], - 'date': [x[1] for x in data], - 'value': [x[2] for x in data]}) - - df1['weights'] = df1['value'] / df1['value'].sum() - gb1 = df1.groupby('date').aggregate(np.sum) - - df2['weights'] = df1['value'] / df1['value'].sum() - gb2 = df2.groupby('date').aggregate(np.sum) - - assert (len(gb1) == len(gb2)) - - def test_agg_period_index(self): - from pandas import period_range, PeriodIndex - prng = period_range('2012-1-1', freq='M', periods=3) - df = DataFrame(np.random.randn(3, 2), index=prng) - rs = df.groupby(level=0).sum() - assert isinstance(rs.index, PeriodIndex) - - # GH 3579 - index = period_range(start='1999-01', periods=5, freq='M') - s1 = Series(np.random.rand(len(index)), index=index) - s2 = Series(np.random.rand(len(index)), index=index) - series = [('s1', s1), ('s2', s2)] - df = DataFrame.from_items(series) - grouped = df.groupby(df.index.month) - list(grouped) - - def test_agg_dict_parameter_cast_result_dtypes(self): - # GH 12821 - - df = DataFrame( - {'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], - 'time': date_range('1/1/2011', periods=8, freq='H')}) - df.loc[[0, 1, 2, 5], 'time'] = None - - # test for `first` function - exp = df.loc[[0, 3, 4, 6]].set_index('class') - grouped = df.groupby('class') - assert_frame_equal(grouped.first(), exp) - assert_frame_equal(grouped.agg('first'), exp) - assert_frame_equal(grouped.agg({'time': 'first'}), exp) - assert_series_equal(grouped.time.first(), exp['time']) - assert_series_equal(grouped.time.agg('first'), exp['time']) - - # test for `last` function - exp = df.loc[[0, 3, 4, 7]].set_index('class') - grouped = df.groupby('class') - assert_frame_equal(grouped.last(), exp) - assert_frame_equal(grouped.agg('last'), exp) - assert_frame_equal(grouped.agg({'time': 'last'}), exp) - assert_series_equal(grouped.time.last(), exp['time']) - assert_series_equal(grouped.time.agg('last'), exp['time']) - - # count - exp = pd.Series([2, 2, 2, 2], - index=Index(list('ABCD'), name='class'), - name='time') - assert_series_equal(grouped.time.agg(len), exp) - assert_series_equal(grouped.time.size(), exp) - - exp = pd.Series([0, 1, 1, 2], - index=Index(list('ABCD'), name='class'), - name='time') - assert_series_equal(grouped.time.count(), exp) - - def test_agg_cast_results_dtypes(self): - # similar to GH12821 - # xref #11444 - u = [datetime(2015, x + 1, 1) for x in range(12)] - v = list('aaabbbbbbccd') - df = pd.DataFrame({'X': v, 'Y': u}) - - result = df.groupby('X')['Y'].agg(len) - expected = df.groupby('X')['Y'].count() - assert_series_equal(result, expected) - def test_agg_must_agg(self): grouped = self.df.groupby('A')['C'] - pytest.raises(Exception, grouped.agg, lambda x: x.describe()) - pytest.raises(Exception, grouped.agg, lambda x: x.index[:2]) + + with pytest.raises(Exception): + grouped.agg(lambda x: x.describe()) + + with pytest.raises(Exception): + grouped.agg(lambda x: x.index[:2]) def test_agg_ser_multi_key(self): # TODO(wesm): unused @@ -200,15 +97,17 @@ def test_agg_apply_corner(self): assert self.ts.dtype == np.float64 # groupby float64 values results in Float64Index - exp = Series([], dtype=np.float64, index=pd.Index( - [], dtype=np.float64)) + exp = Series([], + dtype=np.float64, + index=pd.Index([], dtype=np.float64)) assert_series_equal(grouped.sum(), exp) assert_series_equal(grouped.agg(np.sum), exp) assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False) # DataFrame grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan) - exp_df = DataFrame(columns=self.tsframe.columns, dtype=float, + exp_df = DataFrame(columns=self.tsframe.columns, + dtype=float, index=pd.Index([], dtype=np.float64)) assert_frame_equal(grouped.sum(), exp_df, check_names=False) assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) @@ -234,194 +133,6 @@ def test_agg_grouping_is_list_tuple(self): expected = grouped.mean() tm.assert_frame_equal(result, expected) - def test_aggregate_float64_no_int64(self): - # see gh-11199 - df = DataFrame({"a": [1, 2, 3, 4, 5], - "b": [1, 2, 2, 4, 5], - "c": [1, 2, 3, 4, 5]}) - - expected = DataFrame({"a": [1, 2.5, 4, 5]}, - index=[1, 2, 4, 5]) - expected.index.name = "b" - - result = df.groupby("b")[["a"]].mean() - tm.assert_frame_equal(result, expected) - - expected = DataFrame({"a": [1, 2.5, 4, 5], - "c": [1, 2.5, 4, 5]}, - index=[1, 2, 4, 5]) - expected.index.name = "b" - - result = df.groupby("b")[["a", "c"]].mean() - tm.assert_frame_equal(result, expected) - - def test_aggregate_api_consistency(self): - # GH 9052 - # make sure that the aggregates via dict - # are consistent - - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': np.random.randn(8) + 1.0, - 'D': np.arange(8)}) - - grouped = df.groupby(['A', 'B']) - c_mean = grouped['C'].mean() - c_sum = grouped['C'].sum() - d_mean = grouped['D'].mean() - d_sum = grouped['D'].sum() - - result = grouped['D'].agg(['sum', 'mean']) - expected = pd.concat([d_sum, d_mean], - axis=1) - expected.columns = ['sum', 'mean'] - assert_frame_equal(result, expected, check_like=True) - - result = grouped.agg([np.sum, np.mean]) - expected = pd.concat([c_sum, - c_mean, - d_sum, - d_mean], - axis=1) - expected.columns = MultiIndex.from_product([['C', 'D'], - ['sum', 'mean']]) - assert_frame_equal(result, expected, check_like=True) - - result = grouped[['D', 'C']].agg([np.sum, np.mean]) - expected = pd.concat([d_sum, - d_mean, - c_sum, - c_mean], - axis=1) - expected.columns = MultiIndex.from_product([['D', 'C'], - ['sum', 'mean']]) - assert_frame_equal(result, expected, check_like=True) - - result = grouped.agg({'C': 'mean', 'D': 'sum'}) - expected = pd.concat([d_sum, - c_mean], - axis=1) - assert_frame_equal(result, expected, check_like=True) - - result = grouped.agg({'C': ['mean', 'sum'], - 'D': ['mean', 'sum']}) - expected = pd.concat([c_mean, - c_sum, - d_mean, - d_sum], - axis=1) - expected.columns = MultiIndex.from_product([['C', 'D'], - ['mean', 'sum']]) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = grouped[['D', 'C']].agg({'r': np.sum, - 'r2': np.mean}) - expected = pd.concat([d_sum, - c_sum, - d_mean, - c_mean], - axis=1) - expected.columns = MultiIndex.from_product([['r', 'r2'], - ['D', 'C']]) - assert_frame_equal(result, expected, check_like=True) - - def test_agg_dict_renaming_deprecation(self): - # 15931 - df = pd.DataFrame({'A': [1, 1, 1, 2, 2], - 'B': range(5), - 'C': range(5)}) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False) as w: - df.groupby('A').agg({'B': {'foo': ['sum', 'max']}, - 'C': {'bar': ['count', 'min']}}) - assert "using a dict with renaming" in str(w[0].message) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - df.groupby('A')[['B', 'C']].agg({'ma': 'max'}) - - with tm.assert_produces_warning(FutureWarning) as w: - df.groupby('A').B.agg({'foo': 'count'}) - assert "using a dict on a Series for aggregation" in str( - w[0].message) - - def test_agg_compat(self): - - # GH 12334 - - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': np.random.randn(8) + 1.0, - 'D': np.arange(8)}) - - g = df.groupby(['A', 'B']) - - expected = pd.concat([g['D'].sum(), - g['D'].std()], - axis=1) - expected.columns = MultiIndex.from_tuples([('C', 'sum'), - ('C', 'std')]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = g['D'].agg({'C': ['sum', 'std']}) - assert_frame_equal(result, expected, check_like=True) - - expected = pd.concat([g['D'].sum(), - g['D'].std()], - axis=1) - expected.columns = ['C', 'D'] - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = g['D'].agg({'C': 'sum', 'D': 'std'}) - assert_frame_equal(result, expected, check_like=True) - - def test_agg_nested_dicts(self): - - # API change for disallowing these types of nested dicts - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': np.random.randn(8) + 1.0, - 'D': np.arange(8)}) - - g = df.groupby(['A', 'B']) - - def f(): - g.aggregate({'r1': {'C': ['mean', 'sum']}, - 'r2': {'D': ['mean', 'sum']}}) - - pytest.raises(SpecificationError, f) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = g.agg({'C': {'ra': ['mean', 'std']}, - 'D': {'rb': ['mean', 'std']}}) - expected = pd.concat([g['C'].mean(), g['C'].std(), g['D'].mean(), - g['D'].std()], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), ( - 'ra', 'std'), ('rb', 'mean'), ('rb', 'std')]) - assert_frame_equal(result, expected, check_like=True) - - # same name as the original column - # GH9052 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - expected = g['D'].agg({'result1': np.sum, 'result2': np.mean}) - expected = expected.rename(columns={'result1': 'D'}) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = g['D'].agg({'D': np.sum, 'result2': np.mean}) - assert_frame_equal(result, expected, check_like=True) - def test_agg_python_multiindex(self): grouped = self.mframe.groupby(['A', 'B']) @@ -491,169 +202,6 @@ def aggfun(ser): assert isinstance(result, DataFrame) assert len(result) == 0 - def test_agg_item_by_item_raise_typeerror(self): - from numpy.random import randint - - df = DataFrame(randint(10, size=(20, 10))) - - def raiseException(df): - pprint_thing('----------------------------------------') - pprint_thing(df.to_string()) - raise TypeError - - pytest.raises(TypeError, df.groupby(0).agg, raiseException) - - def test_series_agg_multikey(self): - ts = tm.makeTimeSeries() - grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) - - result = grouped.agg(np.sum) - expected = grouped.sum() - assert_series_equal(result, expected) - - def test_series_agg_multi_pure_python(self): - data = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) - - def bad(x): - assert (len(x.base) > 0) - return 'foo' - - result = data.groupby(['A', 'B']).agg(bad) - expected = data.groupby(['A', 'B']).agg(lambda x: 'foo') - assert_frame_equal(result, expected) - - def test_cythonized_aggers(self): - data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan], - 'B': ['A', 'B'] * 6, - 'C': np.random.randn(12)} - df = DataFrame(data) - df.loc[2:10:2, 'C'] = nan - - def _testit(name): - - op = lambda x: getattr(x, name)() - - # single column - grouped = df.drop(['B'], axis=1).groupby('A') - exp = {} - for cat, group in grouped: - exp[cat] = op(group['C']) - exp = DataFrame({'C': exp}) - exp.index.name = 'A' - result = op(grouped) - assert_frame_equal(result, exp) - - # multiple columns - grouped = df.groupby(['A', 'B']) - expd = {} - for (cat1, cat2), group in grouped: - expd.setdefault(cat1, {})[cat2] = op(group['C']) - exp = DataFrame(expd).T.stack(dropna=False) - exp.index.names = ['A', 'B'] - exp.name = 'C' - - result = op(grouped)['C'] - if name in ['sum', 'prod']: - assert_series_equal(result, exp) - - _testit('count') - _testit('sum') - _testit('std') - _testit('var') - _testit('sem') - _testit('mean') - _testit('median') - _testit('prod') - _testit('min') - _testit('max') - - def test_cython_agg_boolean(self): - frame = DataFrame({'a': np.random.randint(0, 5, 50), - 'b': np.random.randint(0, 2, 50).astype('bool')}) - result = frame.groupby('a')['b'].mean() - expected = frame.groupby('a')['b'].agg(np.mean) - - assert_series_equal(result, expected) - - def test_cython_agg_nothing_to_agg(self): - frame = DataFrame({'a': np.random.randint(0, 5, 50), - 'b': ['foo', 'bar'] * 25}) - pytest.raises(DataError, frame.groupby('a')['b'].mean) - - frame = DataFrame({'a': np.random.randint(0, 5, 50), - 'b': ['foo', 'bar'] * 25}) - pytest.raises(DataError, frame[['b']].groupby(frame['a']).mean) - - def test_cython_agg_nothing_to_agg_with_dates(self): - frame = DataFrame({'a': np.random.randint(0, 5, 50), - 'b': ['foo', 'bar'] * 25, - 'dates': pd.date_range('now', periods=50, - freq='T')}) - with tm.assert_raises_regex(DataError, - "No numeric types to aggregate"): - frame.groupby('b').dates.mean() - - def test_cython_agg_frame_columns(self): - # #2113 - df = DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]}) - - df.groupby(level=0, axis='columns').mean() - df.groupby(level=0, axis='columns').mean() - df.groupby(level=0, axis='columns').mean() - df.groupby(level=0, axis='columns').mean() - - def test_cython_agg_return_dict(self): - # GH 16741 - ts = self.df.groupby('A')['B'].agg( - lambda x: x.value_counts().to_dict()) - expected = Series([{'two': 1, 'one': 1, 'three': 1}, - {'two': 2, 'one': 2, 'three': 1}], - index=Index(['bar', 'foo'], name='A'), - name='B') - assert_series_equal(ts, expected) - - def test_cython_fail_agg(self): - dr = bdate_range('1/1/2000', periods=50) - ts = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dr) - - grouped = ts.groupby(lambda x: x.month) - summed = grouped.sum() - expected = grouped.agg(np.sum) - assert_series_equal(summed, expected) - - def test_agg_consistency(self): - # agg with ([]) and () not consistent - # GH 6715 - - def P1(a): - try: - return np.percentile(a.dropna(), q=1) - except Exception: - return np.nan - - import datetime as dt - df = DataFrame({'col1': [1, 2, 3, 4], - 'col2': [10, 25, 26, 31], - 'date': [dt.date(2013, 2, 10), dt.date(2013, 2, 10), - dt.date(2013, 2, 11), dt.date(2013, 2, 11)]}) - - g = df.groupby('date') - - expected = g.agg([P1]) - expected.columns = expected.columns.levels[0] - - result = g.agg(P1) - assert_frame_equal(result, expected) - def test_wrap_agg_out(self): grouped = self.three_group.groupby(['A', 'B']) @@ -770,22 +318,113 @@ def test_multi_function_flexible_mix(self): assert_frame_equal(result, expected) assert_frame_equal(result2, expected) - def test_agg_callables(self): - # GH 7929 - df = DataFrame({'foo': [1, 2], 'bar': [3, 4]}).astype(np.int64) - class fn_class(object): +class TestGroupByAggregateCython(object): + + def test_cythonized_aggers(self): + data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan], + 'B': ['A', 'B'] * 6, + 'C': np.random.randn(12)} + df = DataFrame(data) + df.loc[2:10:2, 'C'] = nan - def __call__(self, x): - return sum(x) + def _testit(name): - equiv_callables = [sum, np.sum, lambda x: sum(x), lambda x: x.sum(), - partial(sum), fn_class()] + op = lambda x: getattr(x, name)() - expected = df.groupby("foo").agg(sum) - for ecall in equiv_callables: - result = df.groupby('foo').agg(ecall) - assert_frame_equal(result, expected) + # single column + grouped = df.drop(['B'], axis=1).groupby('A') + exp = {} + for cat, group in grouped: + exp[cat] = op(group['C']) + exp = DataFrame({'C': exp}) + exp.index.name = 'A' + result = op(grouped) + assert_frame_equal(result, exp) + + # multiple columns + grouped = df.groupby(['A', 'B']) + expd = {} + for (cat1, cat2), group in grouped: + expd.setdefault(cat1, {})[cat2] = op(group['C']) + exp = DataFrame(expd).T.stack(dropna=False) + exp.index.names = ['A', 'B'] + exp.name = 'C' + + result = op(grouped)['C'] + if name in ['sum', 'prod']: + assert_series_equal(result, exp) + + _testit('count') + _testit('sum') + _testit('std') + _testit('var') + _testit('sem') + _testit('mean') + _testit('median') + _testit('prod') + _testit('min') + _testit('max') + + def test_cython_agg_boolean(self): + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': np.random.randint(0, 2, 50).astype('bool')}) + result = frame.groupby('a')['b'].mean() + expected = frame.groupby('a')['b'].agg(np.mean) + + assert_series_equal(result, expected) + + def test_cython_agg_nothing_to_agg(self): + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': ['foo', 'bar'] * 25}) + pytest.raises(DataError, frame.groupby('a')['b'].mean) + + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': ['foo', 'bar'] * 25}) + pytest.raises(DataError, frame[['b']].groupby(frame['a']).mean) + + def test_cython_agg_nothing_to_agg_with_dates(self): + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': ['foo', 'bar'] * 25, + 'dates': pd.date_range('now', periods=50, + freq='T')}) + with tm.assert_raises_regex(DataError, + "No numeric types to aggregate"): + frame.groupby('b').dates.mean() + + def test_cython_agg_frame_columns(self): + # #2113 + df = DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]}) + + df.groupby(level=0, axis='columns').mean() + df.groupby(level=0, axis='columns').mean() + df.groupby(level=0, axis='columns').mean() + df.groupby(level=0, axis='columns').mean() + + def test_cython_agg_return_dict(self): + # GH 16741 + df = DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + + ts = df.groupby('A')['B'].agg( + lambda x: x.value_counts().to_dict()) + expected = Series([{'two': 1, 'one': 1, 'three': 1}, + {'two': 2, 'one': 2, 'three': 1}], + index=Index(['bar', 'foo'], name='A'), + name='B') + assert_series_equal(ts, expected) + + def test_cython_fail_agg(self): + dr = bdate_range('1/1/2000', periods=50) + ts = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dr) + + grouped = ts.groupby(lambda x: x.month) + summed = grouped.sum() + expected = grouped.agg(np.sum) + assert_series_equal(summed, expected) def test__cython_agg_general(self): ops = [('mean', np.mean), @@ -864,98 +503,495 @@ def test_agg_category_nansum(self): name='B') tm.assert_series_equal(result, expected) - def test_agg_over_numpy_arrays(self): - # GH 3788 - df = pd.DataFrame([[1, np.array([10, 20, 30])], - [1, np.array([40, 50, 60])], - [2, np.array([20, 30, 40])]], - columns=['category', 'arraydata']) - result = df.groupby('category').agg(sum) - - expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]] - expected_index = pd.Index([1, 2], name='category') - expected_column = ['arraydata'] - expected = pd.DataFrame(expected_data, - index=expected_index, - columns=expected_column) - assert_frame_equal(result, expected) +def test_agg_api(): - def test_agg_timezone_round_trip(self): - # GH 15426 - ts = pd.Timestamp("2016-01-01 12:00:00", tz='US/Pacific') - df = pd.DataFrame({'a': 1, 'b': [ts + timedelta(minutes=nn) - for nn in range(10)]}) - - result1 = df.groupby('a')['b'].agg(np.min).iloc[0] - result2 = df.groupby('a')['b'].agg(lambda x: np.min(x)).iloc[0] - result3 = df.groupby('a')['b'].min().iloc[0] - - assert result1 == ts - assert result2 == ts - assert result3 == ts - - dates = [pd.Timestamp("2016-01-0%d 12:00:00" % i, tz='US/Pacific') - for i in range(1, 5)] - df = pd.DataFrame({'A': ['a', 'b'] * 2, 'B': dates}) - grouped = df.groupby('A') - - ts = df['B'].iloc[0] - assert ts == grouped.nth(0)['B'].iloc[0] - assert ts == grouped.head(1)['B'].iloc[0] - assert ts == grouped.first()['B'].iloc[0] - assert ts == grouped.apply(lambda x: x.iloc[0])[0] - - ts = df['B'].iloc[2] - assert ts == grouped.last()['B'].iloc[0] - assert ts == grouped.apply(lambda x: x.iloc[-1])[0] - - def test_sum_uint64_overflow(self): - # see gh-14758 - - # Convert to uint64 and don't overflow - df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], - dtype=object) + 9223372036854775807 - - index = pd.Index([9223372036854775808, 9223372036854775810, - 9223372036854775812], dtype=np.uint64) - expected = pd.DataFrame({1: [9223372036854775809, - 9223372036854775811, - 9223372036854775813]}, index=index) - - expected.index.name = 0 - result = df.groupby(0).sum() - tm.assert_frame_equal(result, expected) + # GH 6337 + # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error + # different api for agg when passed custom function with mixed frame + + df = DataFrame({'data1': np.random.randn(5), + 'data2': np.random.randn(5), + 'key1': ['a', 'a', 'b', 'b', 'a'], + 'key2': ['one', 'two', 'one', 'two', 'one']}) + grouped = df.groupby('key1') + + def peak_to_peak(arr): + return arr.max() - arr.min() + + expected = grouped.agg([peak_to_peak]) + expected.columns = ['data1', 'data2'] + result = grouped.agg(peak_to_peak) + assert_frame_equal(result, expected) + + +def test_agg_datetimes_mixed(): + data = [[1, '2012-01-01', 1.0], [2, '2012-01-02', 2.0], [3, None, 3.0]] + + df1 = DataFrame({'key': [x[0] for x in data], + 'date': [x[1] for x in data], + 'value': [x[2] for x in data]}) + + data = [[row[0], datetime.strptime(row[1], '%Y-%m-%d').date() if row[1] + else None, row[2]] for row in data] - @pytest.mark.parametrize("structure, expected", [ - (tuple, pd.DataFrame({'C': {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})), - (list, pd.DataFrame({'C': {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})), - (lambda x: tuple(x), pd.DataFrame({'C': {(1, 1): (1, 1, 1), - (3, 4): (3, 4, 4)}})), - (lambda x: list(x), pd.DataFrame({'C': {(1, 1): [1, 1, 1], - (3, 4): [3, 4, 4]}})) - ]) - def test_agg_structs_dataframe(self, structure, expected): - df = pd.DataFrame({'A': [1, 1, 1, 3, 3, 3], - 'B': [1, 1, 1, 4, 4, 4], 'C': [1, 1, 1, 3, 4, 4]}) - - result = df.groupby(['A', 'B']).aggregate(structure) - expected.index.names = ['A', 'B'] + df2 = DataFrame({'key': [x[0] for x in data], + 'date': [x[1] for x in data], + 'value': [x[2] for x in data]}) + + df1['weights'] = df1['value'] / df1['value'].sum() + gb1 = df1.groupby('date').aggregate(np.sum) + + df2['weights'] = df1['value'] / df1['value'].sum() + gb2 = df2.groupby('date').aggregate(np.sum) + + assert (len(gb1) == len(gb2)) + + +def test_agg_period_index(): + from pandas import period_range, PeriodIndex + prng = period_range('2012-1-1', freq='M', periods=3) + df = DataFrame(np.random.randn(3, 2), index=prng) + rs = df.groupby(level=0).sum() + assert isinstance(rs.index, PeriodIndex) + + # GH 3579 + index = period_range(start='1999-01', periods=5, freq='M') + s1 = Series(np.random.rand(len(index)), index=index) + s2 = Series(np.random.rand(len(index)), index=index) + series = [('s1', s1), ('s2', s2)] + df = DataFrame.from_items(series) + grouped = df.groupby(df.index.month) + list(grouped) + + +def test_agg_dict_parameter_cast_result_dtypes(): + # GH 12821 + + df = DataFrame( + {'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], + 'time': date_range('1/1/2011', periods=8, freq='H')}) + df.loc[[0, 1, 2, 5], 'time'] = None + + # test for `first` function + exp = df.loc[[0, 3, 4, 6]].set_index('class') + grouped = df.groupby('class') + assert_frame_equal(grouped.first(), exp) + assert_frame_equal(grouped.agg('first'), exp) + assert_frame_equal(grouped.agg({'time': 'first'}), exp) + assert_series_equal(grouped.time.first(), exp['time']) + assert_series_equal(grouped.time.agg('first'), exp['time']) + + # test for `last` function + exp = df.loc[[0, 3, 4, 7]].set_index('class') + grouped = df.groupby('class') + assert_frame_equal(grouped.last(), exp) + assert_frame_equal(grouped.agg('last'), exp) + assert_frame_equal(grouped.agg({'time': 'last'}), exp) + assert_series_equal(grouped.time.last(), exp['time']) + assert_series_equal(grouped.time.agg('last'), exp['time']) + + # count + exp = pd.Series([2, 2, 2, 2], + index=Index(list('ABCD'), name='class'), + name='time') + assert_series_equal(grouped.time.agg(len), exp) + assert_series_equal(grouped.time.size(), exp) + + exp = pd.Series([0, 1, 1, 2], + index=Index(list('ABCD'), name='class'), + name='time') + assert_series_equal(grouped.time.count(), exp) + + +def test_agg_cast_results_dtypes(): + # similar to GH12821 + # xref #11444 + u = [datetime(2015, x + 1, 1) for x in range(12)] + v = list('aaabbbbbbccd') + df = pd.DataFrame({'X': v, 'Y': u}) + + result = df.groupby('X')['Y'].agg(len) + expected = df.groupby('X')['Y'].count() + assert_series_equal(result, expected) + + +def test_aggregate_float64_no_int64(): + # see gh-11199 + df = DataFrame({"a": [1, 2, 3, 4, 5], + "b": [1, 2, 2, 4, 5], + "c": [1, 2, 3, 4, 5]}) + + expected = DataFrame({"a": [1, 2.5, 4, 5]}, + index=[1, 2, 4, 5]) + expected.index.name = "b" + + result = df.groupby("b")[["a"]].mean() + tm.assert_frame_equal(result, expected) + + expected = DataFrame({"a": [1, 2.5, 4, 5], + "c": [1, 2.5, 4, 5]}, + index=[1, 2, 4, 5]) + expected.index.name = "b" + + result = df.groupby("b")[["a", "c"]].mean() + tm.assert_frame_equal(result, expected) + + +def test_aggregate_api_consistency(): + # GH 9052 + # make sure that the aggregates via dict + # are consistent + + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': np.random.randn(8) + 1.0, + 'D': np.arange(8)}) + + grouped = df.groupby(['A', 'B']) + c_mean = grouped['C'].mean() + c_sum = grouped['C'].sum() + d_mean = grouped['D'].mean() + d_sum = grouped['D'].sum() + + result = grouped['D'].agg(['sum', 'mean']) + expected = pd.concat([d_sum, d_mean], + axis=1) + expected.columns = ['sum', 'mean'] + assert_frame_equal(result, expected, check_like=True) + + result = grouped.agg([np.sum, np.mean]) + expected = pd.concat([c_sum, + c_mean, + d_sum, + d_mean], + axis=1) + expected.columns = MultiIndex.from_product([['C', 'D'], + ['sum', 'mean']]) + assert_frame_equal(result, expected, check_like=True) + + result = grouped[['D', 'C']].agg([np.sum, np.mean]) + expected = pd.concat([d_sum, + d_mean, + c_sum, + c_mean], + axis=1) + expected.columns = MultiIndex.from_product([['D', 'C'], + ['sum', 'mean']]) + assert_frame_equal(result, expected, check_like=True) + + result = grouped.agg({'C': 'mean', 'D': 'sum'}) + expected = pd.concat([d_sum, + c_mean], + axis=1) + assert_frame_equal(result, expected, check_like=True) + + result = grouped.agg({'C': ['mean', 'sum'], + 'D': ['mean', 'sum']}) + expected = pd.concat([c_mean, + c_sum, + d_mean, + d_sum], + axis=1) + expected.columns = MultiIndex.from_product([['C', 'D'], + ['mean', 'sum']]) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = grouped[['D', 'C']].agg({'r': np.sum, + 'r2': np.mean}) + expected = pd.concat([d_sum, + c_sum, + d_mean, + c_mean], + axis=1) + expected.columns = MultiIndex.from_product([['r', 'r2'], + ['D', 'C']]) + assert_frame_equal(result, expected, check_like=True) + + +def test_agg_dict_renaming_deprecation(): + # 15931 + df = pd.DataFrame({'A': [1, 1, 1, 2, 2], + 'B': range(5), + 'C': range(5)}) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False) as w: + df.groupby('A').agg({'B': {'foo': ['sum', 'max']}, + 'C': {'bar': ['count', 'min']}}) + assert "using a dict with renaming" in str(w[0].message) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + df.groupby('A')[['B', 'C']].agg({'ma': 'max'}) + + with tm.assert_produces_warning(FutureWarning) as w: + df.groupby('A').B.agg({'foo': 'count'}) + assert "using a dict on a Series for aggregation" in str(w[0].message) + + +def test_agg_compat(): + # GH 12334 + + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': np.random.randn(8) + 1.0, + 'D': np.arange(8)}) + + g = df.groupby(['A', 'B']) + + expected = pd.concat([g['D'].sum(), + g['D'].std()], + axis=1) + expected.columns = MultiIndex.from_tuples([('C', 'sum'), + ('C', 'std')]) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = g['D'].agg({'C': ['sum', 'std']}) + assert_frame_equal(result, expected, check_like=True) + + expected = pd.concat([g['D'].sum(), + g['D'].std()], + axis=1) + expected.columns = ['C', 'D'] + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = g['D'].agg({'C': 'sum', 'D': 'std'}) + assert_frame_equal(result, expected, check_like=True) + + +def test_agg_nested_dicts(): + # API change for disallowing these types of nested dicts + + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': np.random.randn(8) + 1.0, + 'D': np.arange(8)}) + + g = df.groupby(['A', 'B']) + + with pytest.raises(SpecificationError): + g.aggregate({'r1': {'C': ['mean', 'sum']}, + 'r2': {'D': ['mean', 'sum']}}) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = g.agg({'C': {'ra': ['mean', 'std']}, + 'D': {'rb': ['mean', 'std']}}) + expected = pd.concat([g['C'].mean(), g['C'].std(), + g['D'].mean(), g['D'].std()], + axis=1) + expected.columns = pd.MultiIndex.from_tuples( + [('ra', 'mean'), ('ra', 'std'), + ('rb', 'mean'), ('rb', 'std')]) + assert_frame_equal(result, expected, check_like=True) + + # same name as the original column + # GH9052 + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + expected = g['D'].agg({'result1': np.sum, 'result2': np.mean}) + expected = expected.rename(columns={'result1': 'D'}) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = g['D'].agg({'D': np.sum, 'result2': np.mean}) + assert_frame_equal(result, expected, check_like=True) + + +def test_agg_item_by_item_raise_typeerror(): + from numpy.random import randint + + df = DataFrame(randint(10, size=(20, 10))) + + def raiseException(df): + pprint_thing('----------------------------------------') + pprint_thing(df.to_string()) + raise TypeError + + with pytest.raises(TypeError): + df.groupby(0).agg(raiseException) + + +def test_series_agg_multikey(): + ts = tm.makeTimeSeries() + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + + result = grouped.agg(np.sum) + expected = grouped.sum() + assert_series_equal(result, expected) + + +def test_series_agg_multi_pure_python(): + data = DataFrame( + {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', + 'dull', 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) + + def bad(x): + assert (len(x.base) > 0) + return 'foo' + + result = data.groupby(['A', 'B']).agg(bad) + expected = data.groupby(['A', 'B']).agg(lambda x: 'foo') + assert_frame_equal(result, expected) + + +def test_agg_consistency(): + # agg with ([]) and () not consistent + # GH 6715 + + def P1(a): + try: + return np.percentile(a.dropna(), q=1) + except Exception: + return np.nan + + import datetime as dt + df = DataFrame({'col1': [1, 2, 3, 4], + 'col2': [10, 25, 26, 31], + 'date': [dt.date(2013, 2, 10), dt.date(2013, 2, 10), + dt.date(2013, 2, 11), dt.date(2013, 2, 11)]}) + + g = df.groupby('date') + + expected = g.agg([P1]) + expected.columns = expected.columns.levels[0] + + result = g.agg(P1) + assert_frame_equal(result, expected) + + +def test_agg_callables(): + # GH 7929 + df = DataFrame({'foo': [1, 2], 'bar': [3, 4]}).astype(np.int64) + + class fn_class(object): + + def __call__(self, x): + return sum(x) + + equiv_callables = [sum, np.sum, lambda x: sum(x), lambda x: x.sum(), + partial(sum), fn_class()] + + expected = df.groupby("foo").agg(sum) + for ecall in equiv_callables: + result = df.groupby('foo').agg(ecall) assert_frame_equal(result, expected) - @pytest.mark.parametrize("structure, expected", [ - (tuple, pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name='C')), - (list, pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name='C')), - (lambda x: tuple(x), pd.Series([(1, 1, 1), (3, 4, 4)], - index=[1, 3], name='C')), - (lambda x: list(x), pd.Series([[1, 1, 1], [3, 4, 4]], - index=[1, 3], name='C')) - ]) - def test_agg_structs_series(self, structure, expected): - # Issue #18079 - df = pd.DataFrame({'A': [1, 1, 1, 3, 3, 3], - 'B': [1, 1, 1, 4, 4, 4], 'C': [1, 1, 1, 3, 4, 4]}) - - result = df.groupby('A')['C'].aggregate(structure) - expected.index.name = 'A' - assert_series_equal(result, expected) + +def test_agg_over_numpy_arrays(): + # GH 3788 + df = pd.DataFrame([[1, np.array([10, 20, 30])], + [1, np.array([40, 50, 60])], + [2, np.array([20, 30, 40])]], + columns=['category', 'arraydata']) + result = df.groupby('category').agg(sum) + + expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]] + expected_index = pd.Index([1, 2], name='category') + expected_column = ['arraydata'] + expected = pd.DataFrame(expected_data, + index=expected_index, + columns=expected_column) + + assert_frame_equal(result, expected) + + +def test_agg_timezone_round_trip(): + # GH 15426 + ts = pd.Timestamp("2016-01-01 12:00:00", tz='US/Pacific') + df = pd.DataFrame({'a': 1, 'b': [ts + timedelta(minutes=nn) + for nn in range(10)]}) + + result1 = df.groupby('a')['b'].agg(np.min).iloc[0] + result2 = df.groupby('a')['b'].agg(lambda x: np.min(x)).iloc[0] + result3 = df.groupby('a')['b'].min().iloc[0] + + assert result1 == ts + assert result2 == ts + assert result3 == ts + + dates = [pd.Timestamp("2016-01-0%d 12:00:00" % i, tz='US/Pacific') + for i in range(1, 5)] + df = pd.DataFrame({'A': ['a', 'b'] * 2, 'B': dates}) + grouped = df.groupby('A') + + ts = df['B'].iloc[0] + assert ts == grouped.nth(0)['B'].iloc[0] + assert ts == grouped.head(1)['B'].iloc[0] + assert ts == grouped.first()['B'].iloc[0] + assert ts == grouped.apply(lambda x: x.iloc[0])[0] + + ts = df['B'].iloc[2] + assert ts == grouped.last()['B'].iloc[0] + assert ts == grouped.apply(lambda x: x.iloc[-1])[0] + + +def test_sum_uint64_overflow(): + # see gh-14758 + + # Convert to uint64 and don't overflow + df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], + dtype=object) + 9223372036854775807 + + index = pd.Index([9223372036854775808, 9223372036854775810, + 9223372036854775812], dtype=np.uint64) + expected = pd.DataFrame({1: [9223372036854775809, + 9223372036854775811, + 9223372036854775813]}, index=index) + + expected.index.name = 0 + result = df.groupby(0).sum() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("structure, expected", [ + (tuple, pd.DataFrame({'C': {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})), + (list, pd.DataFrame({'C': {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})), + (lambda x: tuple(x), pd.DataFrame({'C': {(1, 1): (1, 1, 1), + (3, 4): (3, 4, 4)}})), + (lambda x: list(x), pd.DataFrame({'C': {(1, 1): [1, 1, 1], + (3, 4): [3, 4, 4]}})) +]) +def test_agg_structs_dataframe(structure, expected): + df = pd.DataFrame({'A': [1, 1, 1, 3, 3, 3], + 'B': [1, 1, 1, 4, 4, 4], + 'C': [1, 1, 1, 3, 4, 4]}) + + result = df.groupby(['A', 'B']).aggregate(structure) + expected.index.names = ['A', 'B'] + assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("structure, expected", [ + (tuple, pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name='C')), + (list, pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name='C')), + (lambda x: tuple(x), pd.Series([(1, 1, 1), (3, 4, 4)], + index=[1, 3], name='C')), + (lambda x: list(x), pd.Series([[1, 1, 1], [3, 4, 4]], + index=[1, 3], name='C')) +]) +def test_agg_structs_series(structure, expected): + # Issue #18079 + df = pd.DataFrame({'A': [1, 1, 1, 3, 3, 3], + 'B': [1, 1, 1, 4, 4, 4], + 'C': [1, 1, 1, 3, 4, 4]}) + + result = df.groupby('A')['C'].aggregate(structure) + expected.index.name = 'A' + assert_series_equal(result, expected) From 0350e2d07600eb4e1f58221edba8e3719a7e0d27 Mon Sep 17 00:00:00 2001 From: Aly Sivji Date: Fri, 29 Dec 2017 19:02:45 -0600 Subject: [PATCH 02/13] Orgnaize new tests --- pandas/tests/groupby/test_aggregate.py | 29 +++++++++++++------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py index 3306902209b06..5027277ab2f4b 100644 --- a/pandas/tests/groupby/test_aggregate.py +++ b/pandas/tests/groupby/test_aggregate.py @@ -489,20 +489,6 @@ def test_cython_agg_empty_buckets_nanops(self): index=pd.CategoricalIndex(intervals, name='a', ordered=True)) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(reason="GH-18869: agg func not called on empty groups.") - def test_agg_category_nansum(self): - categories = ['a', 'b', 'c'] - df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], - categories=categories), - 'B': [1, 2, 3]}) - result = df.groupby("A").B.agg(np.nansum) - expected = pd.Series([3, 3, 0], - index=pd.CategoricalIndex(['a', 'b', 'c'], - categories=categories, - name='A'), - name='B') - tm.assert_series_equal(result, expected) - def test_agg_api(): @@ -995,3 +981,18 @@ def test_agg_structs_series(structure, expected): result = df.groupby('A')['C'].aggregate(structure) expected.index.name = 'A' assert_series_equal(result, expected) + + +@pytest.mark.xfail(reason="GH-18869: agg func not called on empty groups.") +def test_agg_category_nansum(self): + categories = ['a', 'b', 'c'] + df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], + categories=categories), + 'B': [1, 2, 3]}) + result = df.groupby("A").B.agg(np.nansum) + expected = pd.Series([3, 3, 0], + index=pd.CategoricalIndex(['a', 'b', 'c'], + categories=categories, + name='A'), + name='B') + tm.assert_series_equal(result, expected) From 3b5fc39c278ddcb325b2f80b5ec13dd786249a4c Mon Sep 17 00:00:00 2001 From: Aly Sivji Date: Sat, 23 Dec 2017 16:39:21 -0600 Subject: [PATCH 03/13] Change pytest.raises to pandas test helper --- pandas/tests/groupby/test_aggregate.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py index 5027277ab2f4b..aa546c8116f03 100644 --- a/pandas/tests/groupby/test_aggregate.py +++ b/pandas/tests/groupby/test_aggregate.py @@ -242,7 +242,9 @@ def test_agg_multiple_functions_too_many_lambdas(self): grouped = self.df.groupby('A') funcs = ['mean', lambda x: x.mean(), lambda x: x.std()] - pytest.raises(SpecificationError, grouped.agg, funcs) + msg = 'Function names must be unique, found multiple named ' + with tm.assert_raises_regex(SpecificationError, msg): + grouped.agg(funcs) def test_more_flexible_frame_multi_function(self): @@ -491,7 +493,6 @@ def test_cython_agg_empty_buckets_nanops(self): def test_agg_api(): - # GH 6337 # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error # different api for agg when passed custom function with mixed frame From 5a25b061b0deaa303e56a48e2b7bf5e3af14d21f Mon Sep 17 00:00:00 2001 From: Aly Sivji Date: Sun, 24 Dec 2017 12:59:47 -0600 Subject: [PATCH 04/13] Replace pytest.raises with pandas test helper --- pandas/tests/groupby/test_aggregate.py | 40 ++++++++++++++------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py index aa546c8116f03..84f3f6afed739 100644 --- a/pandas/tests/groupby/test_aggregate.py +++ b/pandas/tests/groupby/test_aggregate.py @@ -76,10 +76,10 @@ def test_agg_regression1(self): def test_agg_must_agg(self): grouped = self.df.groupby('A')['C'] - with pytest.raises(Exception): + msg = "Must produce aggregated value" + with tm.assert_raises_regex(Exception, msg): grouped.agg(lambda x: x.describe()) - - with pytest.raises(Exception): + with tm.assert_raises_regex(Exception, msg): grouped.agg(lambda x: x.index[:2]) def test_agg_ser_multi_key(self): @@ -290,27 +290,26 @@ def test_multi_function_flexible_mix(self): # GH #1268 grouped = self.df.groupby('A') - d = OrderedDict([['C', OrderedDict([['foo', 'mean'], [ - 'bar', 'std' - ]])], ['D', 'sum']]) + d = OrderedDict([['C', OrderedDict([['foo', 'mean'], + ['bar', 'std']])], ['D', 'sum']]) # this uses column selection & renaming with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = grouped.aggregate(d) - d2 = OrderedDict([['C', OrderedDict([['foo', 'mean'], [ - 'bar', 'std' - ]])], ['D', ['sum']]]) + d2 = OrderedDict([['C', OrderedDict([['foo', 'mean'], + ['bar', 'std']])], + ['D', ['sum']]]) # this uses column selection & renaming with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result2 = grouped.aggregate(d2) - d3 = OrderedDict([['C', OrderedDict([['foo', 'mean'], [ - 'bar', 'std' - ]])], ['D', {'sum': 'sum'}]]) + d3 = OrderedDict([['C', OrderedDict([['foo', 'mean'], + ['bar', 'std']])], + ['D', {'sum': 'sum'}]]) # this uses column selection & renaming with tm.assert_produces_warning(FutureWarning, @@ -379,11 +378,15 @@ def test_cython_agg_boolean(self): def test_cython_agg_nothing_to_agg(self): frame = DataFrame({'a': np.random.randint(0, 5, 50), 'b': ['foo', 'bar'] * 25}) - pytest.raises(DataError, frame.groupby('a')['b'].mean) + with tm.assert_raises_regex(DataError, + "No numeric types to aggregate"): + frame.groupby('a')['b'].mean() frame = DataFrame({'a': np.random.randint(0, 5, 50), 'b': ['foo', 'bar'] * 25}) - pytest.raises(DataError, frame[['b']].groupby(frame['a']).mean) + with tm.assert_raises_regex(DataError, + "No numeric types to aggregate"): + frame[['b']].groupby(frame['a']).mean() def test_cython_agg_nothing_to_agg_with_dates(self): frame = DataFrame({'a': np.random.randint(0, 5, 50), @@ -557,7 +560,7 @@ def test_agg_dict_parameter_cast_result_dtypes(): df = DataFrame( {'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], - 'time': date_range('1/1/2011', periods=8, freq='H')}) + 'time': date_range('1/1/2011', periods=8, freq='H')}) df.loc[[0, 1, 2, 5], 'time'] = None # test for `first` function @@ -765,7 +768,8 @@ def test_agg_nested_dicts(): g = df.groupby(['A', 'B']) - with pytest.raises(SpecificationError): + msg = "cannot perform renaming for r1 with a nested dictionary" + with tm.assert_raises_regex(SpecificationError, msg): g.aggregate({'r1': {'C': ['mean', 'sum']}, 'r2': {'D': ['mean', 'sum']}}) @@ -802,9 +806,9 @@ def test_agg_item_by_item_raise_typeerror(): def raiseException(df): pprint_thing('----------------------------------------') pprint_thing(df.to_string()) - raise TypeError + raise TypeError('test') - with pytest.raises(TypeError): + with tm.assert_raises_regex(TypeError, 'test'): df.groupby(0).agg(raiseException) From 91d3fa4ff54a9c0b91929127693fda6bcb3f2ff3 Mon Sep 17 00:00:00 2001 From: Aly Sivji Date: Sun, 24 Dec 2017 13:04:46 -0600 Subject: [PATCH 05/13] Add vscode to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index b1748ae72b8ba..0d4e8c6fb75a6 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ .ipynb_checkpoints .tags .cache/ +.vscode/ # Compiled source # ################### From 8521a8edff4c2dee2dd763fd54564c86348fd874 Mon Sep 17 00:00:00 2001 From: Aly Sivji Date: Sun, 24 Dec 2017 14:24:22 -0600 Subject: [PATCH 06/13] Fix failing test --- pandas/tests/groupby/test_aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py index 84f3f6afed739..9a5a355416423 100644 --- a/pandas/tests/groupby/test_aggregate.py +++ b/pandas/tests/groupby/test_aggregate.py @@ -768,7 +768,7 @@ def test_agg_nested_dicts(): g = df.groupby(['A', 'B']) - msg = "cannot perform renaming for r1 with a nested dictionary" + msg = r'cannot perform renaming for r[1-2] with a nested dictionary' with tm.assert_raises_regex(SpecificationError, msg): g.aggregate({'r1': {'C': ['mean', 'sum']}, 'r2': {'D': ['mean', 'sum']}}) From 93e7e380c7add48114a55b660c632d49fcc4b4cf Mon Sep 17 00:00:00 2001 From: Aly Sivji Date: Wed, 27 Dec 2017 10:27:58 -0600 Subject: [PATCH 07/13] Split tests_aggregate.py tests into multiple files --- pandas/tests/groupby/aggregate/__init__.py | 0 .../tests/groupby/aggregate/test_aggregate.py | 320 ++++++++++++ pandas/tests/groupby/aggregate/test_cython.py | 200 ++++++++ .../test_other.py} | 470 ------------------ 4 files changed, 520 insertions(+), 470 deletions(-) create mode 100644 pandas/tests/groupby/aggregate/__init__.py create mode 100644 pandas/tests/groupby/aggregate/test_aggregate.py create mode 100644 pandas/tests/groupby/aggregate/test_cython.py rename pandas/tests/groupby/{test_aggregate.py => aggregate/test_other.py} (51%) diff --git a/pandas/tests/groupby/aggregate/__init__.py b/pandas/tests/groupby/aggregate/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py new file mode 100644 index 0000000000000..e14a47ead939d --- /dev/null +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -0,0 +1,320 @@ +# -*- coding: utf-8 -*- + +""" +we test .agg behavior / note that .apply is tested +generally in test_groupby.py +""" + +from __future__ import print_function + +import pytest + +from datetime import datetime, timedelta +from functools import partial + +import numpy as np +from numpy import nan +import pandas as pd + +from pandas import (date_range, MultiIndex, DataFrame, + Series, Index, bdate_range, concat) +from pandas.util.testing import assert_frame_equal, assert_series_equal +from pandas.core.groupby import SpecificationError, DataError +from pandas.compat import OrderedDict +from pandas.io.formats.printing import pprint_thing +import pandas.util.testing as tm + + +class TestGroupByAggregate(object): + + def setup_method(self, method): + self.ts = tm.makeTimeSeries() + + self.seriesd = tm.getSeriesData() + self.tsd = tm.getTimeSeriesData() + self.frame = DataFrame(self.seriesd) + self.tsframe = DataFrame(self.tsd) + + self.df = DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + + self.df_mixed_floats = DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.array( + np.random.randn(8), dtype='float32')}) + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', + 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.mframe = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + + self.three_group = DataFrame( + {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', + 'dull', 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) + + def test_agg_regression1(self): + grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.agg(np.mean) + expected = grouped.mean() + assert_frame_equal(result, expected) + + def test_agg_must_agg(self): + grouped = self.df.groupby('A')['C'] + + msg = "Must produce aggregated value" + with tm.assert_raises_regex(Exception, msg): + grouped.agg(lambda x: x.describe()) + with tm.assert_raises_regex(Exception, msg): + grouped.agg(lambda x: x.index[:2]) + + def test_agg_ser_multi_key(self): + # TODO(wesm): unused + ser = self.df.C # noqa + + f = lambda x: x.sum() + results = self.df.C.groupby([self.df.A, self.df.B]).aggregate(f) + expected = self.df.groupby(['A', 'B']).sum()['C'] + assert_series_equal(results, expected) + + def test_agg_apply_corner(self): + # nothing to group, all NA + grouped = self.ts.groupby(self.ts * np.nan) + assert self.ts.dtype == np.float64 + + # groupby float64 values results in Float64Index + exp = Series([], + dtype=np.float64, + index=pd.Index([], dtype=np.float64)) + assert_series_equal(grouped.sum(), exp) + assert_series_equal(grouped.agg(np.sum), exp) + assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False) + + # DataFrame + grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan) + exp_df = DataFrame(columns=self.tsframe.columns, + dtype=float, + index=pd.Index([], dtype=np.float64)) + assert_frame_equal(grouped.sum(), exp_df, check_names=False) + assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) + assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], + check_names=False) + + def test_agg_grouping_is_list_tuple(self): + from pandas.core.groupby import Grouping + + df = tm.makeTimeDataFrame() + + grouped = df.groupby(lambda x: x.year) + grouper = grouped.grouper.groupings[0].grouper + grouped.grouper.groupings[0] = Grouping(self.ts.index, list(grouper)) + + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + grouped.grouper.groupings[0] = Grouping(self.ts.index, tuple(grouper)) + + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + def test_agg_python_multiindex(self): + grouped = self.mframe.groupby(['A', 'B']) + + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + def test_aggregate_str_func(self): + def _check_results(grouped): + # single series + result = grouped['A'].agg('std') + expected = grouped['A'].std() + assert_series_equal(result, expected) + + # group frame by function name + result = grouped.aggregate('var') + expected = grouped.var() + assert_frame_equal(result, expected) + + # group frame by function dict + result = grouped.agg(OrderedDict([['A', 'var'], ['B', 'std'], + ['C', 'mean'], ['D', 'sem']])) + expected = DataFrame(OrderedDict([['A', grouped['A'].var( + )], ['B', grouped['B'].std()], ['C', grouped['C'].mean()], + ['D', grouped['D'].sem()]])) + assert_frame_equal(result, expected) + + by_weekday = self.tsframe.groupby(lambda x: x.weekday()) + _check_results(by_weekday) + + by_mwkday = self.tsframe.groupby([lambda x: x.month, + lambda x: x.weekday()]) + _check_results(by_mwkday) + + def test_aggregate_item_by_item(self): + + df = self.df.copy() + df['E'] = ['a'] * len(self.df) + grouped = self.df.groupby('A') + + # API change in 0.11 + # def aggfun(ser): + # return len(ser + 'a') + # result = grouped.agg(aggfun) + # assert len(result.columns) == 1 + + aggfun = lambda ser: ser.size + result = grouped.agg(aggfun) + foo = (self.df.A == 'foo').sum() + bar = (self.df.A == 'bar').sum() + K = len(result.columns) + + # GH5782 + # odd comparisons can result here, so cast to make easy + exp = pd.Series(np.array([foo] * K), index=list('BCD'), + dtype=np.float64, name='foo') + tm.assert_series_equal(result.xs('foo'), exp) + + exp = pd.Series(np.array([bar] * K), index=list('BCD'), + dtype=np.float64, name='bar') + tm.assert_almost_equal(result.xs('bar'), exp) + + def aggfun(ser): + return ser.size + + result = DataFrame().groupby(self.df.A).agg(aggfun) + assert isinstance(result, DataFrame) + assert len(result) == 0 + + def test_wrap_agg_out(self): + grouped = self.three_group.groupby(['A', 'B']) + + def func(ser): + if ser.dtype == np.object: + raise TypeError + else: + return ser.sum() + + result = grouped.aggregate(func) + exp_grouped = self.three_group.loc[:, self.three_group.columns != 'C'] + expected = exp_grouped.groupby(['A', 'B']).aggregate(func) + assert_frame_equal(result, expected) + + def test_agg_multiple_functions_maintain_order(self): + # GH #610 + funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)] + result = self.df.groupby('A')['C'].agg(funcs) + exp_cols = Index(['mean', 'max', 'min']) + + tm.assert_index_equal(result.columns, exp_cols) + + def test_multiple_functions_tuples_and_non_tuples(self): + # #1359 + + funcs = [('foo', 'mean'), 'std'] + ex_funcs = [('foo', 'mean'), ('std', 'std')] + + result = self.df.groupby('A')['C'].agg(funcs) + expected = self.df.groupby('A')['C'].agg(ex_funcs) + assert_frame_equal(result, expected) + + result = self.df.groupby('A').agg(funcs) + expected = self.df.groupby('A').agg(ex_funcs) + assert_frame_equal(result, expected) + + def test_agg_multiple_functions_too_many_lambdas(self): + grouped = self.df.groupby('A') + funcs = ['mean', lambda x: x.mean(), lambda x: x.std()] + + msg = 'Function names must be unique, found multiple named ' + with tm.assert_raises_regex(SpecificationError, msg): + grouped.agg(funcs) + + def test_more_flexible_frame_multi_function(self): + + grouped = self.df.groupby('A') + + exmean = grouped.agg(OrderedDict([['C', np.mean], ['D', np.mean]])) + exstd = grouped.agg(OrderedDict([['C', np.std], ['D', np.std]])) + + expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1) + expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1) + + d = OrderedDict([['C', [np.mean, np.std]], ['D', [np.mean, np.std]]]) + result = grouped.aggregate(d) + + assert_frame_equal(result, expected) + + # be careful + result = grouped.aggregate(OrderedDict([['C', np.mean], + ['D', [np.mean, np.std]]])) + expected = grouped.aggregate(OrderedDict([['C', np.mean], + ['D', [np.mean, np.std]]])) + assert_frame_equal(result, expected) + + def foo(x): + return np.mean(x) + + def bar(x): + return np.std(x, ddof=1) + + # this uses column selection & renaming + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + d = OrderedDict([['C', np.mean], ['D', OrderedDict( + [['foo', np.mean], ['bar', np.std]])]]) + result = grouped.aggregate(d) + + d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]]) + expected = grouped.aggregate(d) + + assert_frame_equal(result, expected) + + def test_multi_function_flexible_mix(self): + # GH #1268 + grouped = self.df.groupby('A') + + d = OrderedDict([['C', OrderedDict([['foo', 'mean'], + ['bar', 'std']])], ['D', 'sum']]) + + # this uses column selection & renaming + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = grouped.aggregate(d) + + d2 = OrderedDict([['C', OrderedDict([['foo', 'mean'], + ['bar', 'std']])], + ['D', ['sum']]]) + + # this uses column selection & renaming + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result2 = grouped.aggregate(d2) + + d3 = OrderedDict([['C', OrderedDict([['foo', 'mean'], + ['bar', 'std']])], + ['D', {'sum': 'sum'}]]) + + # this uses column selection & renaming + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + expected = grouped.aggregate(d3) + + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py new file mode 100644 index 0000000000000..789e7be78e595 --- /dev/null +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -0,0 +1,200 @@ +# -*- coding: utf-8 -*- + +""" +we test .agg behavior / note that .apply is tested +generally in test_groupby.py +""" + +from __future__ import print_function + +import pytest + +from datetime import datetime, timedelta +from functools import partial + +import numpy as np +from numpy import nan +import pandas as pd + +from pandas import (date_range, MultiIndex, DataFrame, + Series, Index, bdate_range, concat) +from pandas.util.testing import assert_frame_equal, assert_series_equal +from pandas.core.groupby import SpecificationError, DataError +from pandas.compat import OrderedDict +from pandas.io.formats.printing import pprint_thing +import pandas.util.testing as tm + + +class TestGroupByAggregateCython(object): + + def test_cythonized_aggers(self): + data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan], + 'B': ['A', 'B'] * 6, + 'C': np.random.randn(12)} + df = DataFrame(data) + df.loc[2:10:2, 'C'] = nan + + def _testit(name): + + op = lambda x: getattr(x, name)() + + # single column + grouped = df.drop(['B'], axis=1).groupby('A') + exp = {} + for cat, group in grouped: + exp[cat] = op(group['C']) + exp = DataFrame({'C': exp}) + exp.index.name = 'A' + result = op(grouped) + assert_frame_equal(result, exp) + + # multiple columns + grouped = df.groupby(['A', 'B']) + expd = {} + for (cat1, cat2), group in grouped: + expd.setdefault(cat1, {})[cat2] = op(group['C']) + exp = DataFrame(expd).T.stack(dropna=False) + exp.index.names = ['A', 'B'] + exp.name = 'C' + + result = op(grouped)['C'] + if name in ['sum', 'prod']: + assert_series_equal(result, exp) + + _testit('count') + _testit('sum') + _testit('std') + _testit('var') + _testit('sem') + _testit('mean') + _testit('median') + _testit('prod') + _testit('min') + _testit('max') + + def test_cython_agg_boolean(self): + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': np.random.randint(0, 2, 50).astype('bool')}) + result = frame.groupby('a')['b'].mean() + expected = frame.groupby('a')['b'].agg(np.mean) + + assert_series_equal(result, expected) + + def test_cython_agg_nothing_to_agg(self): + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': ['foo', 'bar'] * 25}) + with tm.assert_raises_regex(DataError, + "No numeric types to aggregate"): + frame.groupby('a')['b'].mean() + + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': ['foo', 'bar'] * 25}) + with tm.assert_raises_regex(DataError, + "No numeric types to aggregate"): + frame[['b']].groupby(frame['a']).mean() + + def test_cython_agg_nothing_to_agg_with_dates(self): + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': ['foo', 'bar'] * 25, + 'dates': pd.date_range('now', periods=50, + freq='T')}) + with tm.assert_raises_regex(DataError, + "No numeric types to aggregate"): + frame.groupby('b').dates.mean() + + def test_cython_agg_frame_columns(self): + # #2113 + df = DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]}) + + df.groupby(level=0, axis='columns').mean() + df.groupby(level=0, axis='columns').mean() + df.groupby(level=0, axis='columns').mean() + df.groupby(level=0, axis='columns').mean() + + def test_cython_agg_return_dict(self): + # GH 16741 + df = DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + + ts = df.groupby('A')['B'].agg( + lambda x: x.value_counts().to_dict()) + expected = Series([{'two': 1, 'one': 1, 'three': 1}, + {'two': 2, 'one': 2, 'three': 1}], + index=Index(['bar', 'foo'], name='A'), + name='B') + assert_series_equal(ts, expected) + + def test_cython_fail_agg(self): + dr = bdate_range('1/1/2000', periods=50) + ts = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dr) + + grouped = ts.groupby(lambda x: x.month) + summed = grouped.sum() + expected = grouped.agg(np.sum) + assert_series_equal(summed, expected) + + def test__cython_agg_general(self): + ops = [('mean', np.mean), + ('median', np.median), + ('var', np.var), + ('add', np.sum), + ('prod', np.prod), + ('min', np.min), + ('max', np.max), + ('first', lambda x: x.iloc[0]), + ('last', lambda x: x.iloc[-1]), ] + df = DataFrame(np.random.randn(1000)) + labels = np.random.randint(0, 50, size=1000).astype(float) + + for op, targop in ops: + result = df.groupby(labels)._cython_agg_general(op) + expected = df.groupby(labels).agg(targop) + try: + tm.assert_frame_equal(result, expected) + except BaseException as exc: + exc.args += ('operation: %s' % op, ) + raise + + @pytest.mark.parametrize('op, targop', [ + ('mean', np.mean), + ('median', lambda x: np.median(x) if len(x) > 0 else np.nan), + ('var', lambda x: np.var(x, ddof=1)), + ('min', np.min), + ('max', np.max), ] + ) + def test_cython_agg_empty_buckets(self, op, targop): + df = pd.DataFrame([11, 12, 13]) + grps = range(0, 55, 5) + + # calling _cython_agg_general directly, instead of via the user API + # which sets different values for min_count, so do that here. + result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op) + expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x)) + try: + tm.assert_frame_equal(result, expected) + except BaseException as exc: + exc.args += ('operation: %s' % op,) + raise + + def test_cython_agg_empty_buckets_nanops(self): + # GH-18869 can't call nanops on empty groups, so hardcode expected + # for these + df = pd.DataFrame([11, 12, 13], columns=['a']) + grps = range(0, 25, 5) + # add / sum + result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('add') + intervals = pd.interval_range(0, 20, freq=5) + expected = pd.DataFrame( + {"a": [0, 0, 36, 0]}, + index=pd.CategoricalIndex(intervals, name='a', ordered=True)) + tm.assert_frame_equal(result, expected) + + # prod + result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('prod') + expected = pd.DataFrame( + {"a": [1, 1, 1716, 1]}, + index=pd.CategoricalIndex(intervals, name='a', ordered=True)) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/aggregate/test_other.py similarity index 51% rename from pandas/tests/groupby/test_aggregate.py rename to pandas/tests/groupby/aggregate/test_other.py index 9a5a355416423..42bd6cb0cca07 100644 --- a/pandas/tests/groupby/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -25,476 +25,6 @@ import pandas.util.testing as tm -class TestGroupByAggregate(object): - - def setup_method(self, method): - self.ts = tm.makeTimeSeries() - - self.seriesd = tm.getSeriesData() - self.tsd = tm.getTimeSeriesData() - self.frame = DataFrame(self.seriesd) - self.tsframe = DataFrame(self.tsd) - - self.df = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - - self.df_mixed_floats = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.array( - np.random.randn(8), dtype='float32')}) - - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - self.mframe = DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) - - self.three_group = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) - - def test_agg_regression1(self): - grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.agg(np.mean) - expected = grouped.mean() - assert_frame_equal(result, expected) - - def test_agg_must_agg(self): - grouped = self.df.groupby('A')['C'] - - msg = "Must produce aggregated value" - with tm.assert_raises_regex(Exception, msg): - grouped.agg(lambda x: x.describe()) - with tm.assert_raises_regex(Exception, msg): - grouped.agg(lambda x: x.index[:2]) - - def test_agg_ser_multi_key(self): - # TODO(wesm): unused - ser = self.df.C # noqa - - f = lambda x: x.sum() - results = self.df.C.groupby([self.df.A, self.df.B]).aggregate(f) - expected = self.df.groupby(['A', 'B']).sum()['C'] - assert_series_equal(results, expected) - - def test_agg_apply_corner(self): - # nothing to group, all NA - grouped = self.ts.groupby(self.ts * np.nan) - assert self.ts.dtype == np.float64 - - # groupby float64 values results in Float64Index - exp = Series([], - dtype=np.float64, - index=pd.Index([], dtype=np.float64)) - assert_series_equal(grouped.sum(), exp) - assert_series_equal(grouped.agg(np.sum), exp) - assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False) - - # DataFrame - grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan) - exp_df = DataFrame(columns=self.tsframe.columns, - dtype=float, - index=pd.Index([], dtype=np.float64)) - assert_frame_equal(grouped.sum(), exp_df, check_names=False) - assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) - assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], - check_names=False) - - def test_agg_grouping_is_list_tuple(self): - from pandas.core.groupby import Grouping - - df = tm.makeTimeDataFrame() - - grouped = df.groupby(lambda x: x.year) - grouper = grouped.grouper.groupings[0].grouper - grouped.grouper.groupings[0] = Grouping(self.ts.index, list(grouper)) - - result = grouped.agg(np.mean) - expected = grouped.mean() - tm.assert_frame_equal(result, expected) - - grouped.grouper.groupings[0] = Grouping(self.ts.index, tuple(grouper)) - - result = grouped.agg(np.mean) - expected = grouped.mean() - tm.assert_frame_equal(result, expected) - - def test_agg_python_multiindex(self): - grouped = self.mframe.groupby(['A', 'B']) - - result = grouped.agg(np.mean) - expected = grouped.mean() - tm.assert_frame_equal(result, expected) - - def test_aggregate_str_func(self): - def _check_results(grouped): - # single series - result = grouped['A'].agg('std') - expected = grouped['A'].std() - assert_series_equal(result, expected) - - # group frame by function name - result = grouped.aggregate('var') - expected = grouped.var() - assert_frame_equal(result, expected) - - # group frame by function dict - result = grouped.agg(OrderedDict([['A', 'var'], ['B', 'std'], - ['C', 'mean'], ['D', 'sem']])) - expected = DataFrame(OrderedDict([['A', grouped['A'].var( - )], ['B', grouped['B'].std()], ['C', grouped['C'].mean()], - ['D', grouped['D'].sem()]])) - assert_frame_equal(result, expected) - - by_weekday = self.tsframe.groupby(lambda x: x.weekday()) - _check_results(by_weekday) - - by_mwkday = self.tsframe.groupby([lambda x: x.month, - lambda x: x.weekday()]) - _check_results(by_mwkday) - - def test_aggregate_item_by_item(self): - - df = self.df.copy() - df['E'] = ['a'] * len(self.df) - grouped = self.df.groupby('A') - - # API change in 0.11 - # def aggfun(ser): - # return len(ser + 'a') - # result = grouped.agg(aggfun) - # assert len(result.columns) == 1 - - aggfun = lambda ser: ser.size - result = grouped.agg(aggfun) - foo = (self.df.A == 'foo').sum() - bar = (self.df.A == 'bar').sum() - K = len(result.columns) - - # GH5782 - # odd comparisons can result here, so cast to make easy - exp = pd.Series(np.array([foo] * K), index=list('BCD'), - dtype=np.float64, name='foo') - tm.assert_series_equal(result.xs('foo'), exp) - - exp = pd.Series(np.array([bar] * K), index=list('BCD'), - dtype=np.float64, name='bar') - tm.assert_almost_equal(result.xs('bar'), exp) - - def aggfun(ser): - return ser.size - - result = DataFrame().groupby(self.df.A).agg(aggfun) - assert isinstance(result, DataFrame) - assert len(result) == 0 - - def test_wrap_agg_out(self): - grouped = self.three_group.groupby(['A', 'B']) - - def func(ser): - if ser.dtype == np.object: - raise TypeError - else: - return ser.sum() - - result = grouped.aggregate(func) - exp_grouped = self.three_group.loc[:, self.three_group.columns != 'C'] - expected = exp_grouped.groupby(['A', 'B']).aggregate(func) - assert_frame_equal(result, expected) - - def test_agg_multiple_functions_maintain_order(self): - # GH #610 - funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)] - result = self.df.groupby('A')['C'].agg(funcs) - exp_cols = Index(['mean', 'max', 'min']) - - tm.assert_index_equal(result.columns, exp_cols) - - def test_multiple_functions_tuples_and_non_tuples(self): - # #1359 - - funcs = [('foo', 'mean'), 'std'] - ex_funcs = [('foo', 'mean'), ('std', 'std')] - - result = self.df.groupby('A')['C'].agg(funcs) - expected = self.df.groupby('A')['C'].agg(ex_funcs) - assert_frame_equal(result, expected) - - result = self.df.groupby('A').agg(funcs) - expected = self.df.groupby('A').agg(ex_funcs) - assert_frame_equal(result, expected) - - def test_agg_multiple_functions_too_many_lambdas(self): - grouped = self.df.groupby('A') - funcs = ['mean', lambda x: x.mean(), lambda x: x.std()] - - msg = 'Function names must be unique, found multiple named ' - with tm.assert_raises_regex(SpecificationError, msg): - grouped.agg(funcs) - - def test_more_flexible_frame_multi_function(self): - - grouped = self.df.groupby('A') - - exmean = grouped.agg(OrderedDict([['C', np.mean], ['D', np.mean]])) - exstd = grouped.agg(OrderedDict([['C', np.std], ['D', np.std]])) - - expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1) - expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1) - - d = OrderedDict([['C', [np.mean, np.std]], ['D', [np.mean, np.std]]]) - result = grouped.aggregate(d) - - assert_frame_equal(result, expected) - - # be careful - result = grouped.aggregate(OrderedDict([['C', np.mean], - ['D', [np.mean, np.std]]])) - expected = grouped.aggregate(OrderedDict([['C', np.mean], - ['D', [np.mean, np.std]]])) - assert_frame_equal(result, expected) - - def foo(x): - return np.mean(x) - - def bar(x): - return np.std(x, ddof=1) - - # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - d = OrderedDict([['C', np.mean], ['D', OrderedDict( - [['foo', np.mean], ['bar', np.std]])]]) - result = grouped.aggregate(d) - - d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]]) - expected = grouped.aggregate(d) - - assert_frame_equal(result, expected) - - def test_multi_function_flexible_mix(self): - # GH #1268 - grouped = self.df.groupby('A') - - d = OrderedDict([['C', OrderedDict([['foo', 'mean'], - ['bar', 'std']])], ['D', 'sum']]) - - # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = grouped.aggregate(d) - - d2 = OrderedDict([['C', OrderedDict([['foo', 'mean'], - ['bar', 'std']])], - ['D', ['sum']]]) - - # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result2 = grouped.aggregate(d2) - - d3 = OrderedDict([['C', OrderedDict([['foo', 'mean'], - ['bar', 'std']])], - ['D', {'sum': 'sum'}]]) - - # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - expected = grouped.aggregate(d3) - - assert_frame_equal(result, expected) - assert_frame_equal(result2, expected) - - -class TestGroupByAggregateCython(object): - - def test_cythonized_aggers(self): - data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan], - 'B': ['A', 'B'] * 6, - 'C': np.random.randn(12)} - df = DataFrame(data) - df.loc[2:10:2, 'C'] = nan - - def _testit(name): - - op = lambda x: getattr(x, name)() - - # single column - grouped = df.drop(['B'], axis=1).groupby('A') - exp = {} - for cat, group in grouped: - exp[cat] = op(group['C']) - exp = DataFrame({'C': exp}) - exp.index.name = 'A' - result = op(grouped) - assert_frame_equal(result, exp) - - # multiple columns - grouped = df.groupby(['A', 'B']) - expd = {} - for (cat1, cat2), group in grouped: - expd.setdefault(cat1, {})[cat2] = op(group['C']) - exp = DataFrame(expd).T.stack(dropna=False) - exp.index.names = ['A', 'B'] - exp.name = 'C' - - result = op(grouped)['C'] - if name in ['sum', 'prod']: - assert_series_equal(result, exp) - - _testit('count') - _testit('sum') - _testit('std') - _testit('var') - _testit('sem') - _testit('mean') - _testit('median') - _testit('prod') - _testit('min') - _testit('max') - - def test_cython_agg_boolean(self): - frame = DataFrame({'a': np.random.randint(0, 5, 50), - 'b': np.random.randint(0, 2, 50).astype('bool')}) - result = frame.groupby('a')['b'].mean() - expected = frame.groupby('a')['b'].agg(np.mean) - - assert_series_equal(result, expected) - - def test_cython_agg_nothing_to_agg(self): - frame = DataFrame({'a': np.random.randint(0, 5, 50), - 'b': ['foo', 'bar'] * 25}) - with tm.assert_raises_regex(DataError, - "No numeric types to aggregate"): - frame.groupby('a')['b'].mean() - - frame = DataFrame({'a': np.random.randint(0, 5, 50), - 'b': ['foo', 'bar'] * 25}) - with tm.assert_raises_regex(DataError, - "No numeric types to aggregate"): - frame[['b']].groupby(frame['a']).mean() - - def test_cython_agg_nothing_to_agg_with_dates(self): - frame = DataFrame({'a': np.random.randint(0, 5, 50), - 'b': ['foo', 'bar'] * 25, - 'dates': pd.date_range('now', periods=50, - freq='T')}) - with tm.assert_raises_regex(DataError, - "No numeric types to aggregate"): - frame.groupby('b').dates.mean() - - def test_cython_agg_frame_columns(self): - # #2113 - df = DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]}) - - df.groupby(level=0, axis='columns').mean() - df.groupby(level=0, axis='columns').mean() - df.groupby(level=0, axis='columns').mean() - df.groupby(level=0, axis='columns').mean() - - def test_cython_agg_return_dict(self): - # GH 16741 - df = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - - ts = df.groupby('A')['B'].agg( - lambda x: x.value_counts().to_dict()) - expected = Series([{'two': 1, 'one': 1, 'three': 1}, - {'two': 2, 'one': 2, 'three': 1}], - index=Index(['bar', 'foo'], name='A'), - name='B') - assert_series_equal(ts, expected) - - def test_cython_fail_agg(self): - dr = bdate_range('1/1/2000', periods=50) - ts = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dr) - - grouped = ts.groupby(lambda x: x.month) - summed = grouped.sum() - expected = grouped.agg(np.sum) - assert_series_equal(summed, expected) - - def test__cython_agg_general(self): - ops = [('mean', np.mean), - ('median', np.median), - ('var', np.var), - ('add', np.sum), - ('prod', np.prod), - ('min', np.min), - ('max', np.max), - ('first', lambda x: x.iloc[0]), - ('last', lambda x: x.iloc[-1]), ] - df = DataFrame(np.random.randn(1000)) - labels = np.random.randint(0, 50, size=1000).astype(float) - - for op, targop in ops: - result = df.groupby(labels)._cython_agg_general(op) - expected = df.groupby(labels).agg(targop) - try: - tm.assert_frame_equal(result, expected) - except BaseException as exc: - exc.args += ('operation: %s' % op, ) - raise - - @pytest.mark.parametrize('op, targop', [ - ('mean', np.mean), - ('median', lambda x: np.median(x) if len(x) > 0 else np.nan), - ('var', lambda x: np.var(x, ddof=1)), - ('min', np.min), - ('max', np.max), ] - ) - def test_cython_agg_empty_buckets(self, op, targop): - df = pd.DataFrame([11, 12, 13]) - grps = range(0, 55, 5) - - # calling _cython_agg_general directly, instead of via the user API - # which sets different values for min_count, so do that here. - result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op) - expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x)) - try: - tm.assert_frame_equal(result, expected) - except BaseException as exc: - exc.args += ('operation: %s' % op,) - raise - - def test_cython_agg_empty_buckets_nanops(self): - # GH-18869 can't call nanops on empty groups, so hardcode expected - # for these - df = pd.DataFrame([11, 12, 13], columns=['a']) - grps = range(0, 25, 5) - # add / sum - result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('add') - intervals = pd.interval_range(0, 20, freq=5) - expected = pd.DataFrame( - {"a": [0, 0, 36, 0]}, - index=pd.CategoricalIndex(intervals, name='a', ordered=True)) - tm.assert_frame_equal(result, expected) - - # prod - result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('prod') - expected = pd.DataFrame( - {"a": [1, 1, 1716, 1]}, - index=pd.CategoricalIndex(intervals, name='a', ordered=True)) - tm.assert_frame_equal(result, expected) - - def test_agg_api(): # GH 6337 # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error From 86b6bf9a4174503554f2e8b1f96953499beb107f Mon Sep 17 00:00:00 2001 From: Aly Sivji Date: Wed, 27 Dec 2017 10:32:29 -0600 Subject: [PATCH 08/13] Clean up imports --- pandas/tests/groupby/aggregate/test_aggregate.py | 14 ++------------ pandas/tests/groupby/aggregate/test_cython.py | 10 ++-------- pandas/tests/groupby/aggregate/test_other.py | 7 ++----- 3 files changed, 6 insertions(+), 25 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index e14a47ead939d..5c38d8ebcb512 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -5,23 +5,13 @@ generally in test_groupby.py """ -from __future__ import print_function - -import pytest - -from datetime import datetime, timedelta -from functools import partial - import numpy as np -from numpy import nan import pandas as pd -from pandas import (date_range, MultiIndex, DataFrame, - Series, Index, bdate_range, concat) +from pandas import concat, DataFrame, Index, MultiIndex, Series from pandas.util.testing import assert_frame_equal, assert_series_equal -from pandas.core.groupby import SpecificationError, DataError +from pandas.core.groupby import SpecificationError from pandas.compat import OrderedDict -from pandas.io.formats.printing import pprint_thing import pandas.util.testing as tm diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 789e7be78e595..14107949db900 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -9,19 +9,13 @@ import pytest -from datetime import datetime, timedelta -from functools import partial - import numpy as np from numpy import nan import pandas as pd -from pandas import (date_range, MultiIndex, DataFrame, - Series, Index, bdate_range, concat) +from pandas import bdate_range, DataFrame, Index, Series from pandas.util.testing import assert_frame_equal, assert_series_equal -from pandas.core.groupby import SpecificationError, DataError -from pandas.compat import OrderedDict -from pandas.io.formats.printing import pprint_thing +from pandas.core.groupby import DataError import pandas.util.testing as tm diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 42bd6cb0cca07..8463da0b66472 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -13,14 +13,11 @@ from functools import partial import numpy as np -from numpy import nan import pandas as pd -from pandas import (date_range, MultiIndex, DataFrame, - Series, Index, bdate_range, concat) +from pandas import date_range, DataFrame, Index, MultiIndex, Series from pandas.util.testing import assert_frame_equal, assert_series_equal -from pandas.core.groupby import SpecificationError, DataError -from pandas.compat import OrderedDict +from pandas.core.groupby import SpecificationError from pandas.io.formats.printing import pprint_thing import pandas.util.testing as tm From 4ee4b6b401293f6f0dc213180adc76e6540e03cf Mon Sep 17 00:00:00 2001 From: Aly Sivji Date: Wed, 27 Dec 2017 10:37:06 -0600 Subject: [PATCH 09/13] Change all assert_frame/assert_series to tm.assert_ --- .../tests/groupby/aggregate/test_aggregate.py | 39 ++++++------ pandas/tests/groupby/aggregate/test_cython.py | 11 ++-- pandas/tests/groupby/aggregate/test_other.py | 63 +++++++++---------- 3 files changed, 55 insertions(+), 58 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 5c38d8ebcb512..6a598c3de55c9 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -9,7 +9,6 @@ import pandas as pd from pandas import concat, DataFrame, Index, MultiIndex, Series -from pandas.util.testing import assert_frame_equal, assert_series_equal from pandas.core.groupby import SpecificationError from pandas.compat import OrderedDict import pandas.util.testing as tm @@ -61,7 +60,7 @@ def test_agg_regression1(self): grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.agg(np.mean) expected = grouped.mean() - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_agg_must_agg(self): grouped = self.df.groupby('A')['C'] @@ -79,7 +78,7 @@ def test_agg_ser_multi_key(self): f = lambda x: x.sum() results = self.df.C.groupby([self.df.A, self.df.B]).aggregate(f) expected = self.df.groupby(['A', 'B']).sum()['C'] - assert_series_equal(results, expected) + tm.assert_series_equal(results, expected) def test_agg_apply_corner(self): # nothing to group, all NA @@ -90,18 +89,18 @@ def test_agg_apply_corner(self): exp = Series([], dtype=np.float64, index=pd.Index([], dtype=np.float64)) - assert_series_equal(grouped.sum(), exp) - assert_series_equal(grouped.agg(np.sum), exp) - assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False) + tm.assert_series_equal(grouped.sum(), exp) + tm.assert_series_equal(grouped.agg(np.sum), exp) + tm.assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False) # DataFrame grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan) exp_df = DataFrame(columns=self.tsframe.columns, dtype=float, index=pd.Index([], dtype=np.float64)) - assert_frame_equal(grouped.sum(), exp_df, check_names=False) - assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) - assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], + tm.assert_frame_equal(grouped.sum(), exp_df, check_names=False) + tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) + tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], check_names=False) def test_agg_grouping_is_list_tuple(self): @@ -135,12 +134,12 @@ def _check_results(grouped): # single series result = grouped['A'].agg('std') expected = grouped['A'].std() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # group frame by function name result = grouped.aggregate('var') expected = grouped.var() - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # group frame by function dict result = grouped.agg(OrderedDict([['A', 'var'], ['B', 'std'], @@ -148,7 +147,7 @@ def _check_results(grouped): expected = DataFrame(OrderedDict([['A', grouped['A'].var( )], ['B', grouped['B'].std()], ['C', grouped['C'].mean()], ['D', grouped['D'].sem()]])) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) by_weekday = self.tsframe.groupby(lambda x: x.weekday()) _check_results(by_weekday) @@ -204,7 +203,7 @@ def func(ser): result = grouped.aggregate(func) exp_grouped = self.three_group.loc[:, self.three_group.columns != 'C'] expected = exp_grouped.groupby(['A', 'B']).aggregate(func) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_agg_multiple_functions_maintain_order(self): # GH #610 @@ -222,11 +221,11 @@ def test_multiple_functions_tuples_and_non_tuples(self): result = self.df.groupby('A')['C'].agg(funcs) expected = self.df.groupby('A')['C'].agg(ex_funcs) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = self.df.groupby('A').agg(funcs) expected = self.df.groupby('A').agg(ex_funcs) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_agg_multiple_functions_too_many_lambdas(self): grouped = self.df.groupby('A') @@ -249,14 +248,14 @@ def test_more_flexible_frame_multi_function(self): d = OrderedDict([['C', [np.mean, np.std]], ['D', [np.mean, np.std]]]) result = grouped.aggregate(d) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # be careful result = grouped.aggregate(OrderedDict([['C', np.mean], ['D', [np.mean, np.std]]])) expected = grouped.aggregate(OrderedDict([['C', np.mean], ['D', [np.mean, np.std]]])) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def foo(x): return np.mean(x) @@ -274,7 +273,7 @@ def bar(x): d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]]) expected = grouped.aggregate(d) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_multi_function_flexible_mix(self): # GH #1268 @@ -306,5 +305,5 @@ def test_multi_function_flexible_mix(self): check_stacklevel=False): expected = grouped.aggregate(d3) - assert_frame_equal(result, expected) - assert_frame_equal(result2, expected) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 14107949db900..c0308baa148f2 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -14,7 +14,6 @@ import pandas as pd from pandas import bdate_range, DataFrame, Index, Series -from pandas.util.testing import assert_frame_equal, assert_series_equal from pandas.core.groupby import DataError import pandas.util.testing as tm @@ -40,7 +39,7 @@ def _testit(name): exp = DataFrame({'C': exp}) exp.index.name = 'A' result = op(grouped) - assert_frame_equal(result, exp) + tm.assert_frame_equal(result, exp) # multiple columns grouped = df.groupby(['A', 'B']) @@ -53,7 +52,7 @@ def _testit(name): result = op(grouped)['C'] if name in ['sum', 'prod']: - assert_series_equal(result, exp) + tm.assert_series_equal(result, exp) _testit('count') _testit('sum') @@ -72,7 +71,7 @@ def test_cython_agg_boolean(self): result = frame.groupby('a')['b'].mean() expected = frame.groupby('a')['b'].agg(np.mean) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_cython_agg_nothing_to_agg(self): frame = DataFrame({'a': np.random.randint(0, 5, 50), @@ -119,7 +118,7 @@ def test_cython_agg_return_dict(self): {'two': 2, 'one': 2, 'three': 1}], index=Index(['bar', 'foo'], name='A'), name='B') - assert_series_equal(ts, expected) + tm.assert_series_equal(ts, expected) def test_cython_fail_agg(self): dr = bdate_range('1/1/2000', periods=50) @@ -128,7 +127,7 @@ def test_cython_fail_agg(self): grouped = ts.groupby(lambda x: x.month) summed = grouped.sum() expected = grouped.agg(np.sum) - assert_series_equal(summed, expected) + tm.assert_series_equal(summed, expected) def test__cython_agg_general(self): ops = [('mean', np.mean), diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 8463da0b66472..75b73846b19a1 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -16,7 +16,6 @@ import pandas as pd from pandas import date_range, DataFrame, Index, MultiIndex, Series -from pandas.util.testing import assert_frame_equal, assert_series_equal from pandas.core.groupby import SpecificationError from pandas.io.formats.printing import pprint_thing import pandas.util.testing as tm @@ -39,7 +38,7 @@ def peak_to_peak(arr): expected = grouped.agg([peak_to_peak]) expected.columns = ['data1', 'data2'] result = grouped.agg(peak_to_peak) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_agg_datetimes_mixed(): @@ -93,32 +92,32 @@ def test_agg_dict_parameter_cast_result_dtypes(): # test for `first` function exp = df.loc[[0, 3, 4, 6]].set_index('class') grouped = df.groupby('class') - assert_frame_equal(grouped.first(), exp) - assert_frame_equal(grouped.agg('first'), exp) - assert_frame_equal(grouped.agg({'time': 'first'}), exp) - assert_series_equal(grouped.time.first(), exp['time']) - assert_series_equal(grouped.time.agg('first'), exp['time']) + tm.assert_frame_equal(grouped.first(), exp) + tm.assert_frame_equal(grouped.agg('first'), exp) + tm.assert_frame_equal(grouped.agg({'time': 'first'}), exp) + tm.assert_series_equal(grouped.time.first(), exp['time']) + tm.assert_series_equal(grouped.time.agg('first'), exp['time']) # test for `last` function exp = df.loc[[0, 3, 4, 7]].set_index('class') grouped = df.groupby('class') - assert_frame_equal(grouped.last(), exp) - assert_frame_equal(grouped.agg('last'), exp) - assert_frame_equal(grouped.agg({'time': 'last'}), exp) - assert_series_equal(grouped.time.last(), exp['time']) - assert_series_equal(grouped.time.agg('last'), exp['time']) + tm.assert_frame_equal(grouped.last(), exp) + tm.assert_frame_equal(grouped.agg('last'), exp) + tm.assert_frame_equal(grouped.agg({'time': 'last'}), exp) + tm.assert_series_equal(grouped.time.last(), exp['time']) + tm.assert_series_equal(grouped.time.agg('last'), exp['time']) # count exp = pd.Series([2, 2, 2, 2], index=Index(list('ABCD'), name='class'), name='time') - assert_series_equal(grouped.time.agg(len), exp) - assert_series_equal(grouped.time.size(), exp) + tm.assert_series_equal(grouped.time.agg(len), exp) + tm.assert_series_equal(grouped.time.size(), exp) exp = pd.Series([0, 1, 1, 2], index=Index(list('ABCD'), name='class'), name='time') - assert_series_equal(grouped.time.count(), exp) + tm.assert_series_equal(grouped.time.count(), exp) def test_agg_cast_results_dtypes(): @@ -130,7 +129,7 @@ def test_agg_cast_results_dtypes(): result = df.groupby('X')['Y'].agg(len) expected = df.groupby('X')['Y'].count() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_aggregate_float64_no_int64(): @@ -177,7 +176,7 @@ def test_aggregate_api_consistency(): expected = pd.concat([d_sum, d_mean], axis=1) expected.columns = ['sum', 'mean'] - assert_frame_equal(result, expected, check_like=True) + tm.assert_frame_equal(result, expected, check_like=True) result = grouped.agg([np.sum, np.mean]) expected = pd.concat([c_sum, @@ -187,7 +186,7 @@ def test_aggregate_api_consistency(): axis=1) expected.columns = MultiIndex.from_product([['C', 'D'], ['sum', 'mean']]) - assert_frame_equal(result, expected, check_like=True) + tm.assert_frame_equal(result, expected, check_like=True) result = grouped[['D', 'C']].agg([np.sum, np.mean]) expected = pd.concat([d_sum, @@ -197,13 +196,13 @@ def test_aggregate_api_consistency(): axis=1) expected.columns = MultiIndex.from_product([['D', 'C'], ['sum', 'mean']]) - assert_frame_equal(result, expected, check_like=True) + tm.assert_frame_equal(result, expected, check_like=True) result = grouped.agg({'C': 'mean', 'D': 'sum'}) expected = pd.concat([d_sum, c_mean], axis=1) - assert_frame_equal(result, expected, check_like=True) + tm.assert_frame_equal(result, expected, check_like=True) result = grouped.agg({'C': ['mean', 'sum'], 'D': ['mean', 'sum']}) @@ -226,7 +225,7 @@ def test_aggregate_api_consistency(): axis=1) expected.columns = MultiIndex.from_product([['r', 'r2'], ['D', 'C']]) - assert_frame_equal(result, expected, check_like=True) + tm.assert_frame_equal(result, expected, check_like=True) def test_agg_dict_renaming_deprecation(): @@ -270,7 +269,7 @@ def test_agg_compat(): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = g['D'].agg({'C': ['sum', 'std']}) - assert_frame_equal(result, expected, check_like=True) + tm.assert_frame_equal(result, expected, check_like=True) expected = pd.concat([g['D'].sum(), g['D'].std()], @@ -280,7 +279,7 @@ def test_agg_compat(): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = g['D'].agg({'C': 'sum', 'D': 'std'}) - assert_frame_equal(result, expected, check_like=True) + tm.assert_frame_equal(result, expected, check_like=True) def test_agg_nested_dicts(): @@ -310,7 +309,7 @@ def test_agg_nested_dicts(): expected.columns = pd.MultiIndex.from_tuples( [('ra', 'mean'), ('ra', 'std'), ('rb', 'mean'), ('rb', 'std')]) - assert_frame_equal(result, expected, check_like=True) + tm.assert_frame_equal(result, expected, check_like=True) # same name as the original column # GH9052 @@ -322,7 +321,7 @@ def test_agg_nested_dicts(): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = g['D'].agg({'D': np.sum, 'result2': np.mean}) - assert_frame_equal(result, expected, check_like=True) + tm.assert_frame_equal(result, expected, check_like=True) def test_agg_item_by_item_raise_typeerror(): @@ -345,7 +344,7 @@ def test_series_agg_multikey(): result = grouped.agg(np.sum) expected = grouped.sum() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_series_agg_multi_pure_python(): @@ -366,7 +365,7 @@ def bad(x): result = data.groupby(['A', 'B']).agg(bad) expected = data.groupby(['A', 'B']).agg(lambda x: 'foo') - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_agg_consistency(): @@ -391,7 +390,7 @@ def P1(a): expected.columns = expected.columns.levels[0] result = g.agg(P1) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_agg_callables(): @@ -409,7 +408,7 @@ def __call__(self, x): expected = df.groupby("foo").agg(sum) for ecall in equiv_callables: result = df.groupby('foo').agg(ecall) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_agg_over_numpy_arrays(): @@ -427,7 +426,7 @@ def test_agg_over_numpy_arrays(): index=expected_index, columns=expected_column) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_agg_timezone_round_trip(): @@ -493,7 +492,7 @@ def test_agg_structs_dataframe(structure, expected): result = df.groupby(['A', 'B']).aggregate(structure) expected.index.names = ['A', 'B'] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("structure, expected", [ @@ -512,7 +511,7 @@ def test_agg_structs_series(structure, expected): result = df.groupby('A')['C'].aggregate(structure) expected.index.name = 'A' - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) @pytest.mark.xfail(reason="GH-18869: agg func not called on empty groups.") From ee353941a2d9544d2807a6830a42df091c58daa3 Mon Sep 17 00:00:00 2001 From: Aly Sivji Date: Wed, 27 Dec 2017 11:01:52 -0600 Subject: [PATCH 10/13] Boyscouting -- cleaning up code --- .../tests/groupby/aggregate/test_aggregate.py | 80 ++++++------- pandas/tests/groupby/aggregate/test_cython.py | 18 ++- pandas/tests/groupby/aggregate/test_other.py | 109 +++++++----------- 3 files changed, 88 insertions(+), 119 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 6a598c3de55c9..35973974da136 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1,8 +1,7 @@ # -*- coding: utf-8 -*- """ -we test .agg behavior / note that .apply is tested -generally in test_groupby.py +test .agg behavior / note that .apply is tested generally in test_groupby.py """ import numpy as np @@ -34,11 +33,10 @@ def setup_method(self, method): {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), - 'D': np.array( - np.random.randn(8), dtype='float32')}) + 'D': np.array(np.random.randn(8), dtype='float32')}) - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) @@ -86,22 +84,21 @@ def test_agg_apply_corner(self): assert self.ts.dtype == np.float64 # groupby float64 values results in Float64Index - exp = Series([], - dtype=np.float64, + exp = Series([], dtype=np.float64, index=pd.Index([], dtype=np.float64)) tm.assert_series_equal(grouped.sum(), exp) tm.assert_series_equal(grouped.agg(np.sum), exp) - tm.assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False) + tm.assert_series_equal(grouped.apply(np.sum), exp, + check_index_type=False) # DataFrame grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan) - exp_df = DataFrame(columns=self.tsframe.columns, - dtype=float, + exp_df = DataFrame(columns=self.tsframe.columns, dtype=float, index=pd.Index([], dtype=np.float64)) tm.assert_frame_equal(grouped.sum(), exp_df, check_names=False) tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], - check_names=False) + check_names=False) def test_agg_grouping_is_list_tuple(self): from pandas.core.groupby import Grouping @@ -142,11 +139,14 @@ def _check_results(grouped): tm.assert_frame_equal(result, expected) # group frame by function dict - result = grouped.agg(OrderedDict([['A', 'var'], ['B', 'std'], - ['C', 'mean'], ['D', 'sem']])) - expected = DataFrame(OrderedDict([['A', grouped['A'].var( - )], ['B', grouped['B'].std()], ['C', grouped['C'].mean()], - ['D', grouped['D'].sem()]])) + result = grouped.agg(OrderedDict([['A', 'var'], + ['B', 'std'], + ['C', 'mean'], + ['D', 'sem']])) + expected = DataFrame(OrderedDict([['A', grouped['A'].var()], + ['B', grouped['B'].std()], + ['C', grouped['C'].mean()], + ['D', grouped['D'].sem()]])) tm.assert_frame_equal(result, expected) by_weekday = self.tsframe.groupby(lambda x: x.weekday()) @@ -264,10 +264,10 @@ def bar(x): return np.std(x, ddof=1) # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - d = OrderedDict([['C', np.mean], ['D', OrderedDict( - [['foo', np.mean], ['bar', np.std]])]]) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + d = OrderedDict([['C', np.mean], + ['D', OrderedDict([['foo', np.mean], + ['bar', np.std]])]]) result = grouped.aggregate(d) d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]]) @@ -279,31 +279,25 @@ def test_multi_function_flexible_mix(self): # GH #1268 grouped = self.df.groupby('A') - d = OrderedDict([['C', OrderedDict([['foo', 'mean'], - ['bar', 'std']])], ['D', 'sum']]) - + # Expected + d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])], + ['D', {'sum': 'sum'}]]) # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = grouped.aggregate(d) - - d2 = OrderedDict([['C', OrderedDict([['foo', 'mean'], - ['bar', 'std']])], - ['D', ['sum']]]) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + expected = grouped.aggregate(d) + # Test 1 + d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])], + ['D', 'sum']]) # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result2 = grouped.aggregate(d2) - - d3 = OrderedDict([['C', OrderedDict([['foo', 'mean'], - ['bar', 'std']])], - ['D', {'sum': 'sum'}]]) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = grouped.aggregate(d) + tm.assert_frame_equal(result, expected) + # Test 2 + d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])], + ['D', ['sum']]]) # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - expected = grouped.aggregate(d3) - + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = grouped.aggregate(d) tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result2, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index c0308baa148f2..dc95db2901f09 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -1,8 +1,7 @@ # -*- coding: utf-8 -*- """ -we test .agg behavior / note that .apply is tested -generally in test_groupby.py +test cython .agg behavior """ from __future__ import print_function @@ -76,14 +75,14 @@ def test_cython_agg_boolean(self): def test_cython_agg_nothing_to_agg(self): frame = DataFrame({'a': np.random.randint(0, 5, 50), 'b': ['foo', 'bar'] * 25}) - with tm.assert_raises_regex(DataError, - "No numeric types to aggregate"): + msg = "No numeric types to aggregate" + + with tm.assert_raises_regex(DataError, msg): frame.groupby('a')['b'].mean() frame = DataFrame({'a': np.random.randint(0, 5, 50), 'b': ['foo', 'bar'] * 25}) - with tm.assert_raises_regex(DataError, - "No numeric types to aggregate"): + with tm.assert_raises_regex(DataError, msg): frame[['b']].groupby(frame['a']).mean() def test_cython_agg_nothing_to_agg_with_dates(self): @@ -91,8 +90,8 @@ def test_cython_agg_nothing_to_agg_with_dates(self): 'b': ['foo', 'bar'] * 25, 'dates': pd.date_range('now', periods=50, freq='T')}) - with tm.assert_raises_regex(DataError, - "No numeric types to aggregate"): + msg = "No numeric types to aggregate" + with tm.assert_raises_regex(DataError, msg): frame.groupby('b').dates.mean() def test_cython_agg_frame_columns(self): @@ -112,8 +111,7 @@ def test_cython_agg_return_dict(self): 'C': np.random.randn(8), 'D': np.random.randn(8)}) - ts = df.groupby('A')['B'].agg( - lambda x: x.value_counts().to_dict()) + ts = df.groupby('A')['B'].agg(lambda x: x.value_counts().to_dict()) expected = Series([{'two': 1, 'one': 1, 'three': 1}, {'two': 2, 'one': 2, 'three': 1}], index=Index(['bar', 'foo'], name='A'), diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 75b73846b19a1..92b8999a7bdc7 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -1,8 +1,7 @@ # -*- coding: utf-8 -*- """ -we test .agg behavior / note that .apply is tested -generally in test_groupby.py +test all other .agg behavior """ from __future__ import print_function @@ -42,14 +41,18 @@ def peak_to_peak(arr): def test_agg_datetimes_mixed(): - data = [[1, '2012-01-01', 1.0], [2, '2012-01-02', 2.0], [3, None, 3.0]] + data = [[1, '2012-01-01', 1.0], + [2, '2012-01-02', 2.0], + [3, None, 3.0]] df1 = DataFrame({'key': [x[0] for x in data], 'date': [x[1] for x in data], 'value': [x[2] for x in data]}) - data = [[row[0], datetime.strptime(row[1], '%Y-%m-%d').date() if row[1] - else None, row[2]] for row in data] + data = [[row[0], + datetime.strptime(row[1], '%Y-%m-%d').date() if row[1] else None, + row[2]] + for row in data] df2 = DataFrame({'key': [x[0] for x in data], 'date': [x[1] for x in data], @@ -84,9 +87,8 @@ def test_agg_period_index(): def test_agg_dict_parameter_cast_result_dtypes(): # GH 12821 - df = DataFrame( - {'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], - 'time': date_range('1/1/2011', periods=8, freq='H')}) + df = DataFrame({'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], + 'time': date_range('1/1/2011', periods=8, freq='H')}) df.loc[[0, 1, 2, 5], 'time'] = None # test for `first` function @@ -138,15 +140,13 @@ def test_aggregate_float64_no_int64(): "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]}) - expected = DataFrame({"a": [1, 2.5, 4, 5]}, - index=[1, 2, 4, 5]) + expected = DataFrame({"a": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5]) expected.index.name = "b" result = df.groupby("b")[["a"]].mean() tm.assert_frame_equal(result, expected) - expected = DataFrame({"a": [1, 2.5, 4, 5], - "c": [1, 2.5, 4, 5]}, + expected = DataFrame({"a": [1, 2.5, 4, 5], "c": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5]) expected.index.name = "b" @@ -173,56 +173,36 @@ def test_aggregate_api_consistency(): d_sum = grouped['D'].sum() result = grouped['D'].agg(['sum', 'mean']) - expected = pd.concat([d_sum, d_mean], - axis=1) + expected = pd.concat([d_sum, d_mean], axis=1) expected.columns = ['sum', 'mean'] tm.assert_frame_equal(result, expected, check_like=True) result = grouped.agg([np.sum, np.mean]) - expected = pd.concat([c_sum, - c_mean, - d_sum, - d_mean], - axis=1) + expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1) expected.columns = MultiIndex.from_product([['C', 'D'], ['sum', 'mean']]) tm.assert_frame_equal(result, expected, check_like=True) result = grouped[['D', 'C']].agg([np.sum, np.mean]) - expected = pd.concat([d_sum, - d_mean, - c_sum, - c_mean], - axis=1) + expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1) expected.columns = MultiIndex.from_product([['D', 'C'], ['sum', 'mean']]) tm.assert_frame_equal(result, expected, check_like=True) result = grouped.agg({'C': 'mean', 'D': 'sum'}) - expected = pd.concat([d_sum, - c_mean], - axis=1) + expected = pd.concat([d_sum, c_mean], axis=1) tm.assert_frame_equal(result, expected, check_like=True) result = grouped.agg({'C': ['mean', 'sum'], 'D': ['mean', 'sum']}) - expected = pd.concat([c_mean, - c_sum, - d_mean, - d_sum], - axis=1) + expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1) expected.columns = MultiIndex.from_product([['C', 'D'], ['mean', 'sum']]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = grouped[['D', 'C']].agg({'r': np.sum, 'r2': np.mean}) - expected = pd.concat([d_sum, - c_sum, - d_mean, - c_mean], - axis=1) + expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1) expected.columns = MultiIndex.from_product([['r', 'r2'], ['D', 'C']]) tm.assert_frame_equal(result, expected, check_like=True) @@ -240,8 +220,7 @@ def test_agg_dict_renaming_deprecation(): 'C': {'bar': ['count', 'min']}}) assert "using a dict with renaming" in str(w[0].message) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): df.groupby('A')[['B', 'C']].agg({'ma': 'max'}) with tm.assert_produces_warning(FutureWarning) as w: @@ -261,23 +240,17 @@ def test_agg_compat(): g = df.groupby(['A', 'B']) - expected = pd.concat([g['D'].sum(), - g['D'].std()], - axis=1) + expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1) expected.columns = MultiIndex.from_tuples([('C', 'sum'), ('C', 'std')]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = g['D'].agg({'C': ['sum', 'std']}) tm.assert_frame_equal(result, expected, check_like=True) - expected = pd.concat([g['D'].sum(), - g['D'].std()], - axis=1) + expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1) expected.columns = ['C', 'D'] - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = g['D'].agg({'C': 'sum', 'D': 'std'}) tm.assert_frame_equal(result, expected, check_like=True) @@ -299,8 +272,7 @@ def test_agg_nested_dicts(): g.aggregate({'r1': {'C': ['mean', 'sum']}, 'r2': {'D': ['mean', 'sum']}}) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = g.agg({'C': {'ra': ['mean', 'std']}, 'D': {'rb': ['mean', 'std']}}) expected = pd.concat([g['C'].mean(), g['C'].std(), @@ -313,13 +285,11 @@ def test_agg_nested_dicts(): # same name as the original column # GH9052 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): expected = g['D'].agg({'result1': np.sum, 'result2': np.mean}) expected = expected.rename(columns={'result1': 'D'}) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = g['D'].agg({'D': np.sum, 'result2': np.mean}) tm.assert_frame_equal(result, expected, check_like=True) @@ -402,8 +372,12 @@ class fn_class(object): def __call__(self, x): return sum(x) - equiv_callables = [sum, np.sum, lambda x: sum(x), lambda x: x.sum(), - partial(sum), fn_class()] + equiv_callables = [sum, + np.sum, + lambda x: sum(x), + lambda x: x.sum(), + partial(sum), + fn_class(), ] expected = df.groupby("foo").agg(sum) for ecall in equiv_callables: @@ -432,8 +406,8 @@ def test_agg_over_numpy_arrays(): def test_agg_timezone_round_trip(): # GH 15426 ts = pd.Timestamp("2016-01-01 12:00:00", tz='US/Pacific') - df = pd.DataFrame({'a': 1, 'b': [ts + timedelta(minutes=nn) - for nn in range(10)]}) + df = pd.DataFrame({'a': 1, + 'b': [ts + timedelta(minutes=nn) for nn in range(10)]}) result1 = df.groupby('a')['b'].agg(np.min).iloc[0] result2 = df.groupby('a')['b'].agg(lambda x: np.min(x)).iloc[0] @@ -463,14 +437,17 @@ def test_sum_uint64_overflow(): # see gh-14758 # Convert to uint64 and don't overflow - df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], - dtype=object) + 9223372036854775807 + df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object) + df = df + 9223372036854775807 - index = pd.Index([9223372036854775808, 9223372036854775810, - 9223372036854775812], dtype=np.uint64) + index = pd.Index([9223372036854775808, + 9223372036854775810, + 9223372036854775812], + dtype=np.uint64) expected = pd.DataFrame({1: [9223372036854775809, 9223372036854775811, - 9223372036854775813]}, index=index) + 9223372036854775813]}, + index=index) expected.index.name = 0 result = df.groupby(0).sum() From d48a149ffb6a3d0ad2d25e0198fbac8af3fce2a8 Mon Sep 17 00:00:00 2001 From: Aly Sivji Date: Wed, 27 Dec 2017 14:55:09 -0600 Subject: [PATCH 11/13] Parametrize tests and remove commented out code --- .../tests/groupby/aggregate/test_aggregate.py | 21 +++---- pandas/tests/groupby/aggregate/test_cython.py | 62 ++++++++++--------- 2 files changed, 41 insertions(+), 42 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 35973974da136..53a2d1d1a9685 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -4,6 +4,8 @@ test .agg behavior / note that .apply is tested generally in test_groupby.py """ +import pytest + import numpy as np import pandas as pd @@ -126,7 +128,11 @@ def test_agg_python_multiindex(self): expected = grouped.mean() tm.assert_frame_equal(result, expected) - def test_aggregate_str_func(self): + @pytest.mark.parametrize('groupbyfunc', [ + lambda x: x.weekday(), + [lambda x: x.month, lambda x: x.weekday()], + ]) + def test_aggregate_str_func(self, groupbyfunc): def _check_results(grouped): # single series result = grouped['A'].agg('std') @@ -149,12 +155,7 @@ def _check_results(grouped): ['D', grouped['D'].sem()]])) tm.assert_frame_equal(result, expected) - by_weekday = self.tsframe.groupby(lambda x: x.weekday()) - _check_results(by_weekday) - - by_mwkday = self.tsframe.groupby([lambda x: x.month, - lambda x: x.weekday()]) - _check_results(by_mwkday) + _check_results(self.tsframe.groupby(groupbyfunc)) def test_aggregate_item_by_item(self): @@ -162,12 +163,6 @@ def test_aggregate_item_by_item(self): df['E'] = ['a'] * len(self.df) grouped = self.df.groupby('A') - # API change in 0.11 - # def aggfun(ser): - # return len(ser + 'a') - # result = grouped.agg(aggfun) - # assert len(result.columns) == 1 - aggfun = lambda ser: ser.size result = grouped.agg(aggfun) foo = (self.df.A == 'foo').sum() diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index dc95db2901f09..7e499fbd201e6 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -19,7 +19,19 @@ class TestGroupByAggregateCython(object): - def test_cythonized_aggers(self): + @pytest.mark.parametrize('op', [ + 'count', + 'sum', + 'std', + 'var', + 'sem', + 'mean', + 'median', + 'prod', + 'min', + 'max', + ]) + def test_cythonized_aggers(self, op): data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan], 'B': ['A', 'B'] * 6, 'C': np.random.randn(12)} @@ -53,16 +65,7 @@ def _testit(name): if name in ['sum', 'prod']: tm.assert_series_equal(result, exp) - _testit('count') - _testit('sum') - _testit('std') - _testit('var') - _testit('sem') - _testit('mean') - _testit('median') - _testit('prod') - _testit('min') - _testit('max') + _testit(op) def test_cython_agg_boolean(self): frame = DataFrame({'a': np.random.randint(0, 5, 50), @@ -127,27 +130,28 @@ def test_cython_fail_agg(self): expected = grouped.agg(np.sum) tm.assert_series_equal(summed, expected) - def test__cython_agg_general(self): - ops = [('mean', np.mean), - ('median', np.median), - ('var', np.var), - ('add', np.sum), - ('prod', np.prod), - ('min', np.min), - ('max', np.max), - ('first', lambda x: x.iloc[0]), - ('last', lambda x: x.iloc[-1]), ] + @pytest.mark.parametrize('op, targop', [ + ('mean', np.mean), + ('median', np.median), + ('var', np.var), + ('add', np.sum), + ('prod', np.prod), + ('min', np.min), + ('max', np.max), + ('first', lambda x: x.iloc[0]), + ('last', lambda x: x.iloc[-1]), + ]) + def test__cython_agg_general(self, op, targop): df = DataFrame(np.random.randn(1000)) labels = np.random.randint(0, 50, size=1000).astype(float) - for op, targop in ops: - result = df.groupby(labels)._cython_agg_general(op) - expected = df.groupby(labels).agg(targop) - try: - tm.assert_frame_equal(result, expected) - except BaseException as exc: - exc.args += ('operation: %s' % op, ) - raise + result = df.groupby(labels)._cython_agg_general(op) + expected = df.groupby(labels).agg(targop) + try: + tm.assert_frame_equal(result, expected) + except BaseException as exc: + exc.args += ('operation: %s' % op, ) + raise @pytest.mark.parametrize('op, targop', [ ('mean', np.mean), From 1add9dd2bef4c589f024adc7a88591d327efab26 Mon Sep 17 00:00:00 2001 From: Aly Sivji Date: Wed, 27 Dec 2017 15:23:50 -0600 Subject: [PATCH 12/13] Delete cython class, all tests are grouped by module --- pandas/tests/groupby/aggregate/test_cython.py | 354 +++++++++--------- 1 file changed, 180 insertions(+), 174 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 7e499fbd201e6..186f31a6ca212 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -17,179 +17,185 @@ import pandas.util.testing as tm -class TestGroupByAggregateCython(object): - - @pytest.mark.parametrize('op', [ - 'count', - 'sum', - 'std', - 'var', - 'sem', - 'mean', - 'median', - 'prod', - 'min', - 'max', - ]) - def test_cythonized_aggers(self, op): - data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan], - 'B': ['A', 'B'] * 6, - 'C': np.random.randn(12)} - df = DataFrame(data) - df.loc[2:10:2, 'C'] = nan - - def _testit(name): - - op = lambda x: getattr(x, name)() - - # single column - grouped = df.drop(['B'], axis=1).groupby('A') - exp = {} - for cat, group in grouped: - exp[cat] = op(group['C']) - exp = DataFrame({'C': exp}) - exp.index.name = 'A' - result = op(grouped) - tm.assert_frame_equal(result, exp) - - # multiple columns - grouped = df.groupby(['A', 'B']) - expd = {} - for (cat1, cat2), group in grouped: - expd.setdefault(cat1, {})[cat2] = op(group['C']) - exp = DataFrame(expd).T.stack(dropna=False) - exp.index.names = ['A', 'B'] - exp.name = 'C' - - result = op(grouped)['C'] - if name in ['sum', 'prod']: - tm.assert_series_equal(result, exp) - - _testit(op) - - def test_cython_agg_boolean(self): - frame = DataFrame({'a': np.random.randint(0, 5, 50), - 'b': np.random.randint(0, 2, 50).astype('bool')}) - result = frame.groupby('a')['b'].mean() - expected = frame.groupby('a')['b'].agg(np.mean) - - tm.assert_series_equal(result, expected) - - def test_cython_agg_nothing_to_agg(self): - frame = DataFrame({'a': np.random.randint(0, 5, 50), - 'b': ['foo', 'bar'] * 25}) - msg = "No numeric types to aggregate" - - with tm.assert_raises_regex(DataError, msg): - frame.groupby('a')['b'].mean() - - frame = DataFrame({'a': np.random.randint(0, 5, 50), - 'b': ['foo', 'bar'] * 25}) - with tm.assert_raises_regex(DataError, msg): - frame[['b']].groupby(frame['a']).mean() - - def test_cython_agg_nothing_to_agg_with_dates(self): - frame = DataFrame({'a': np.random.randint(0, 5, 50), - 'b': ['foo', 'bar'] * 25, - 'dates': pd.date_range('now', periods=50, - freq='T')}) - msg = "No numeric types to aggregate" - with tm.assert_raises_regex(DataError, msg): - frame.groupby('b').dates.mean() - - def test_cython_agg_frame_columns(self): - # #2113 - df = DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]}) - - df.groupby(level=0, axis='columns').mean() - df.groupby(level=0, axis='columns').mean() - df.groupby(level=0, axis='columns').mean() - df.groupby(level=0, axis='columns').mean() - - def test_cython_agg_return_dict(self): - # GH 16741 - df = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - - ts = df.groupby('A')['B'].agg(lambda x: x.value_counts().to_dict()) - expected = Series([{'two': 1, 'one': 1, 'three': 1}, - {'two': 2, 'one': 2, 'three': 1}], - index=Index(['bar', 'foo'], name='A'), - name='B') - tm.assert_series_equal(ts, expected) - - def test_cython_fail_agg(self): - dr = bdate_range('1/1/2000', periods=50) - ts = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dr) - - grouped = ts.groupby(lambda x: x.month) - summed = grouped.sum() - expected = grouped.agg(np.sum) - tm.assert_series_equal(summed, expected) - - @pytest.mark.parametrize('op, targop', [ - ('mean', np.mean), - ('median', np.median), - ('var', np.var), - ('add', np.sum), - ('prod', np.prod), - ('min', np.min), - ('max', np.max), - ('first', lambda x: x.iloc[0]), - ('last', lambda x: x.iloc[-1]), - ]) - def test__cython_agg_general(self, op, targop): - df = DataFrame(np.random.randn(1000)) - labels = np.random.randint(0, 50, size=1000).astype(float) - - result = df.groupby(labels)._cython_agg_general(op) - expected = df.groupby(labels).agg(targop) - try: - tm.assert_frame_equal(result, expected) - except BaseException as exc: - exc.args += ('operation: %s' % op, ) - raise - - @pytest.mark.parametrize('op, targop', [ - ('mean', np.mean), - ('median', lambda x: np.median(x) if len(x) > 0 else np.nan), - ('var', lambda x: np.var(x, ddof=1)), - ('min', np.min), - ('max', np.max), ] - ) - def test_cython_agg_empty_buckets(self, op, targop): - df = pd.DataFrame([11, 12, 13]) - grps = range(0, 55, 5) - - # calling _cython_agg_general directly, instead of via the user API - # which sets different values for min_count, so do that here. - result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op) - expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x)) - try: - tm.assert_frame_equal(result, expected) - except BaseException as exc: - exc.args += ('operation: %s' % op,) - raise - - def test_cython_agg_empty_buckets_nanops(self): - # GH-18869 can't call nanops on empty groups, so hardcode expected - # for these - df = pd.DataFrame([11, 12, 13], columns=['a']) - grps = range(0, 25, 5) - # add / sum - result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('add') - intervals = pd.interval_range(0, 20, freq=5) - expected = pd.DataFrame( - {"a": [0, 0, 36, 0]}, - index=pd.CategoricalIndex(intervals, name='a', ordered=True)) +@pytest.mark.parametrize('op', [ + 'count', + 'sum', + 'std', + 'var', + 'sem', + 'mean', + 'median', + 'prod', + 'min', + 'max', +]) +def test_cythonized_aggers(op): + data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan], + 'B': ['A', 'B'] * 6, + 'C': np.random.randn(12)} + df = DataFrame(data) + df.loc[2:10:2, 'C'] = nan + + def _testit(name): + + op = lambda x: getattr(x, name)() + + # single column + grouped = df.drop(['B'], axis=1).groupby('A') + exp = {} + for cat, group in grouped: + exp[cat] = op(group['C']) + exp = DataFrame({'C': exp}) + exp.index.name = 'A' + result = op(grouped) + tm.assert_frame_equal(result, exp) + + # multiple columns + grouped = df.groupby(['A', 'B']) + expd = {} + for (cat1, cat2), group in grouped: + expd.setdefault(cat1, {})[cat2] = op(group['C']) + exp = DataFrame(expd).T.stack(dropna=False) + exp.index.names = ['A', 'B'] + exp.name = 'C' + + result = op(grouped)['C'] + if name in ['sum', 'prod']: + tm.assert_series_equal(result, exp) + + _testit(op) + + +def test_cython_agg_boolean(): + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': np.random.randint(0, 2, 50).astype('bool')}) + result = frame.groupby('a')['b'].mean() + expected = frame.groupby('a')['b'].agg(np.mean) + + tm.assert_series_equal(result, expected) + + +def test_cython_agg_nothing_to_agg(): + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': ['foo', 'bar'] * 25}) + msg = "No numeric types to aggregate" + + with tm.assert_raises_regex(DataError, msg): + frame.groupby('a')['b'].mean() + + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': ['foo', 'bar'] * 25}) + with tm.assert_raises_regex(DataError, msg): + frame[['b']].groupby(frame['a']).mean() + + +def test_cython_agg_nothing_to_agg_with_dates(): + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': ['foo', 'bar'] * 25, + 'dates': pd.date_range('now', periods=50, freq='T')}) + msg = "No numeric types to aggregate" + with tm.assert_raises_regex(DataError, msg): + frame.groupby('b').dates.mean() + + +def test_cython_agg_frame_columns(): + # #2113 + df = DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]}) + + df.groupby(level=0, axis='columns').mean() + df.groupby(level=0, axis='columns').mean() + df.groupby(level=0, axis='columns').mean() + df.groupby(level=0, axis='columns').mean() + + +def test_cython_agg_return_dict(): + # GH 16741 + df = DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + + ts = df.groupby('A')['B'].agg(lambda x: x.value_counts().to_dict()) + expected = Series([{'two': 1, 'one': 1, 'three': 1}, + {'two': 2, 'one': 2, 'three': 1}], + index=Index(['bar', 'foo'], name='A'), + name='B') + tm.assert_series_equal(ts, expected) + + +def test_cython_fail_agg(): + dr = bdate_range('1/1/2000', periods=50) + ts = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dr) + + grouped = ts.groupby(lambda x: x.month) + summed = grouped.sum() + expected = grouped.agg(np.sum) + tm.assert_series_equal(summed, expected) + + +@pytest.mark.parametrize('op, targop', [ + ('mean', np.mean), + ('median', np.median), + ('var', np.var), + ('add', np.sum), + ('prod', np.prod), + ('min', np.min), + ('max', np.max), + ('first', lambda x: x.iloc[0]), + ('last', lambda x: x.iloc[-1]), +]) +def test__cython_agg_general(op, targop): + df = DataFrame(np.random.randn(1000)) + labels = np.random.randint(0, 50, size=1000).astype(float) + + result = df.groupby(labels)._cython_agg_general(op) + expected = df.groupby(labels).agg(targop) + try: tm.assert_frame_equal(result, expected) - - # prod - result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('prod') - expected = pd.DataFrame( - {"a": [1, 1, 1716, 1]}, - index=pd.CategoricalIndex(intervals, name='a', ordered=True)) + except BaseException as exc: + exc.args += ('operation: %s' % op, ) + raise + + +@pytest.mark.parametrize('op, targop', [ + ('mean', np.mean), + ('median', lambda x: np.median(x) if len(x) > 0 else np.nan), + ('var', lambda x: np.var(x, ddof=1)), + ('min', np.min), + ('max', np.max), ] +) +def test_cython_agg_empty_buckets(op, targop): + df = pd.DataFrame([11, 12, 13]) + grps = range(0, 55, 5) + + # calling _cython_agg_general directly, instead of via the user API + # which sets different values for min_count, so do that here. + result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op) + expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x)) + try: tm.assert_frame_equal(result, expected) + except BaseException as exc: + exc.args += ('operation: %s' % op,) + raise + + +def test_cython_agg_empty_buckets_nanops(): + # GH-18869 can't call nanops on empty groups, so hardcode expected + # for these + df = pd.DataFrame([11, 12, 13], columns=['a']) + grps = range(0, 25, 5) + # add / sum + result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('add') + intervals = pd.interval_range(0, 20, freq=5) + expected = pd.DataFrame( + {"a": [0, 0, 36, 0]}, + index=pd.CategoricalIndex(intervals, name='a', ordered=True)) + tm.assert_frame_equal(result, expected) + + # prod + result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('prod') + expected = pd.DataFrame( + {"a": [1, 1, 1716, 1]}, + index=pd.CategoricalIndex(intervals, name='a', ordered=True)) + tm.assert_frame_equal(result, expected) From 5d763ffbff8c283147d6bfc0d58e70e483e021c5 Mon Sep 17 00:00:00 2001 From: Aly Sivji Date: Wed, 27 Dec 2017 16:27:11 -0600 Subject: [PATCH 13/13] Collapse functions and remove try except blocks --- .../tests/groupby/aggregate/test_aggregate.py | 48 +++++++------- pandas/tests/groupby/aggregate/test_cython.py | 62 ++++++++----------- pandas/tests/groupby/aggregate/test_other.py | 7 +-- 3 files changed, 48 insertions(+), 69 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 53a2d1d1a9685..caf2365a54ec8 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -133,32 +133,30 @@ def test_agg_python_multiindex(self): [lambda x: x.month, lambda x: x.weekday()], ]) def test_aggregate_str_func(self, groupbyfunc): - def _check_results(grouped): - # single series - result = grouped['A'].agg('std') - expected = grouped['A'].std() - tm.assert_series_equal(result, expected) - - # group frame by function name - result = grouped.aggregate('var') - expected = grouped.var() - tm.assert_frame_equal(result, expected) - - # group frame by function dict - result = grouped.agg(OrderedDict([['A', 'var'], - ['B', 'std'], - ['C', 'mean'], - ['D', 'sem']])) - expected = DataFrame(OrderedDict([['A', grouped['A'].var()], - ['B', grouped['B'].std()], - ['C', grouped['C'].mean()], - ['D', grouped['D'].sem()]])) - tm.assert_frame_equal(result, expected) - - _check_results(self.tsframe.groupby(groupbyfunc)) + grouped = self.tsframe.groupby(groupbyfunc) - def test_aggregate_item_by_item(self): + # single series + result = grouped['A'].agg('std') + expected = grouped['A'].std() + tm.assert_series_equal(result, expected) + + # group frame by function name + result = grouped.aggregate('var') + expected = grouped.var() + tm.assert_frame_equal(result, expected) + + # group frame by function dict + result = grouped.agg(OrderedDict([['A', 'var'], + ['B', 'std'], + ['C', 'mean'], + ['D', 'sem']])) + expected = DataFrame(OrderedDict([['A', grouped['A'].var()], + ['B', grouped['B'].std()], + ['C', grouped['C'].mean()], + ['D', grouped['D'].sem()]])) + tm.assert_frame_equal(result, expected) + def test_aggregate_item_by_item(self): df = self.df.copy() df['E'] = ['a'] * len(self.df) grouped = self.df.groupby('A') @@ -210,7 +208,6 @@ def test_agg_multiple_functions_maintain_order(self): def test_multiple_functions_tuples_and_non_tuples(self): # #1359 - funcs = [('foo', 'mean'), 'std'] ex_funcs = [('foo', 'mean'), ('std', 'std')] @@ -231,7 +228,6 @@ def test_agg_multiple_functions_too_many_lambdas(self): grouped.agg(funcs) def test_more_flexible_frame_multi_function(self): - grouped = self.df.groupby('A') exmean = grouped.agg(OrderedDict([['C', np.mean], ['D', np.mean]])) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 186f31a6ca212..c8ee05ddbb74f 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -17,7 +17,7 @@ import pandas.util.testing as tm -@pytest.mark.parametrize('op', [ +@pytest.mark.parametrize('op_name', [ 'count', 'sum', 'std', @@ -29,41 +29,37 @@ 'min', 'max', ]) -def test_cythonized_aggers(op): +def test_cythonized_aggers(op_name): data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan], 'B': ['A', 'B'] * 6, 'C': np.random.randn(12)} df = DataFrame(data) df.loc[2:10:2, 'C'] = nan - def _testit(name): + op = lambda x: getattr(x, op_name)() - op = lambda x: getattr(x, name)() + # single column + grouped = df.drop(['B'], axis=1).groupby('A') + exp = {} + for cat, group in grouped: + exp[cat] = op(group['C']) + exp = DataFrame({'C': exp}) + exp.index.name = 'A' + result = op(grouped) + tm.assert_frame_equal(result, exp) - # single column - grouped = df.drop(['B'], axis=1).groupby('A') - exp = {} - for cat, group in grouped: - exp[cat] = op(group['C']) - exp = DataFrame({'C': exp}) - exp.index.name = 'A' - result = op(grouped) - tm.assert_frame_equal(result, exp) + # multiple columns + grouped = df.groupby(['A', 'B']) + expd = {} + for (cat1, cat2), group in grouped: + expd.setdefault(cat1, {})[cat2] = op(group['C']) + exp = DataFrame(expd).T.stack(dropna=False) + exp.index.names = ['A', 'B'] + exp.name = 'C' - # multiple columns - grouped = df.groupby(['A', 'B']) - expd = {} - for (cat1, cat2), group in grouped: - expd.setdefault(cat1, {})[cat2] = op(group['C']) - exp = DataFrame(expd).T.stack(dropna=False) - exp.index.names = ['A', 'B'] - exp.name = 'C' - - result = op(grouped)['C'] - if name in ['sum', 'prod']: - tm.assert_series_equal(result, exp) - - _testit(op) + result = op(grouped)['C'] + if op_name in ['sum', 'prod']: + tm.assert_series_equal(result, exp) def test_cython_agg_boolean(): @@ -151,11 +147,7 @@ def test__cython_agg_general(op, targop): result = df.groupby(labels)._cython_agg_general(op) expected = df.groupby(labels).agg(targop) - try: - tm.assert_frame_equal(result, expected) - except BaseException as exc: - exc.args += ('operation: %s' % op, ) - raise + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize('op, targop', [ @@ -173,11 +165,7 @@ def test_cython_agg_empty_buckets(op, targop): # which sets different values for min_count, so do that here. result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op) expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x)) - try: - tm.assert_frame_equal(result, expected) - except BaseException as exc: - exc.args += ('operation: %s' % op,) - raise + tm.assert_frame_equal(result, expected) def test_cython_agg_empty_buckets_nanops(): diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 92b8999a7bdc7..f8e44b1548819 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -158,7 +158,6 @@ def test_aggregate_api_consistency(): # GH 9052 # make sure that the aggregates via dict # are consistent - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'two', @@ -230,7 +229,6 @@ def test_agg_dict_renaming_deprecation(): def test_agg_compat(): # GH 12334 - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'two', @@ -257,7 +255,6 @@ def test_agg_compat(): def test_agg_nested_dicts(): # API change for disallowing these types of nested dicts - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'two', @@ -341,7 +338,6 @@ def bad(x): def test_agg_consistency(): # agg with ([]) and () not consistent # GH 6715 - def P1(a): try: return np.percentile(a.dropna(), q=1) @@ -435,7 +431,6 @@ def test_agg_timezone_round_trip(): def test_sum_uint64_overflow(): # see gh-14758 - # Convert to uint64 and don't overflow df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object) df = df + 9223372036854775807 @@ -492,7 +487,7 @@ def test_agg_structs_series(structure, expected): @pytest.mark.xfail(reason="GH-18869: agg func not called on empty groups.") -def test_agg_category_nansum(self): +def test_agg_category_nansum(): categories = ['a', 'b', 'c'] df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], categories=categories),