From 852a99459c2a7d47fc7fb0574cdad8359a42fade Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 18 Sep 2012 13:21:43 -0400 Subject: [PATCH] API: default empty DataFrame to dtype=object to prevent certain class of TypeError, e.g. out of empty SQL query. closes #1783 --- RELEASE.rst | 6 ++++++ pandas/core/frame.py | 7 ++++++- pandas/core/groupby.py | 19 ++++++++++++++----- pandas/core/internals.py | 7 +++++-- pandas/tests/test_frame.py | 16 +++++++++++----- pandas/tests/test_groupby.py | 21 +++++++++++++-------- 6 files changed, 55 insertions(+), 21 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 20835bb346965..a92f78d0a9a08 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -64,6 +64,10 @@ pandas 0.9.0 transposed. Legacy files will still be readable by HDFStore (#1834, #1824) - Legacy cruft removed: pandas.stats.misc.quantileTS - Use ISO8601 format for Period repr: monthly, daily, and on down (#1776) + - Empty DataFrame columns are now created as object dtype. This will prevent + a class of TypeErrors that was occurring in code where the dtype of a + column would depend on the presence of data or not (e.g. a SQL query having + results) (#1783) **Bug fixes** @@ -184,6 +188,8 @@ pandas 0.9.0 datetime.tzinfo without .zone and ._utcoffset attributes (#1922) - Fix DataFrame formatting of small, non-zero FP numbers (#1911) - Various fixes by upcasting of date -> datetime (#1395) + - Raise better exception when passing multiple functions with the same name, + such as lambdas, to GroupBy.aggregate pandas 0.8.1 ============ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a0d7cc2ab044c..bcfe645d5f14c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4994,7 +4994,12 @@ def _homogenize(data, index, columns, dtype=None): if dtype is not None and issubclass(dtype.type, np.integer): continue - v = np.empty(len(index), dtype=dtype) + if dtype is None: + # #1783 + v = np.empty(len(index), dtype=object) + else: + v = np.empty(len(index), dtype=dtype) + v.fill(nan) else: v = data[k] diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 0227fcd189e8e..ae3a0847600e5 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -18,6 +18,11 @@ class GroupByError(Exception): pass +class DataError(GroupByError): + pass + +class SpecificationError(GroupByError): + pass def _groupby_function(name, alias, npfunc): def f(self): @@ -290,7 +295,7 @@ def mean(self): """ try: return self._cython_agg_general('mean') - except GroupByError: + except DataError: raise except Exception: # pragma: no cover f = lambda x: x.mean(axis=self.axis) @@ -304,7 +309,7 @@ def median(self): """ try: return self._cython_agg_general('median') - except GroupByError: + except DataError: raise except Exception: # pragma: no cover f = lambda x: x.median(axis=self.axis) @@ -375,7 +380,7 @@ def _cython_agg_general(self, how): output[name] = result if len(output) == 0: - raise GroupByError('No numeric types to aggregate') + raise DataError('No numeric types to aggregate') return self._wrap_aggregated_output(output, names) @@ -1270,6 +1275,10 @@ def _aggregate_multiple_funcs(self, arg): results = {} for name, func in arg: + if name in results: + raise SpecificationError('Function names must be unique, ' + 'found multiple named %s' % name) + results[name] = self.aggregate(func) return DataFrame(results, columns=columns) @@ -1415,7 +1424,7 @@ def _cython_agg_blocks(self, how): new_blocks.append(newb) if len(new_blocks) == 0: - raise GroupByError('No numeric types to aggregate') + raise DataError('No numeric types to aggregate') return new_blocks @@ -1542,7 +1551,7 @@ def _aggregate_multiple_funcs(self, arg): grouper=self.grouper) results.append(colg.aggregate(arg)) keys.append(col) - except (TypeError, GroupByError): + except (TypeError, DataError): pass result = concat(results, keys=keys, axis=1) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index e33d98cdab837..804150869d680 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -951,7 +951,7 @@ def reindex_axis(self, new_axis, method=None, axis=0, copy=True): result.axes[axis] = new_axis if axis == 0: - # patch ref_items + # patch ref_items, #1823 for blk in result.blocks: blk.ref_items = new_axis @@ -1290,7 +1290,10 @@ def form_blocks(data, axes): if len(extra_items): shape = (len(extra_items),) + tuple(len(x) for x in axes[1:]) - block_values = np.empty(shape, dtype=float) + + # empty items -> dtype object + block_values = np.empty(shape, dtype=object) + block_values.fill(nan) na_block = make_block(block_values, extra_items, items, diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 5ac3fae414147..12fd35ecad02f 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1593,12 +1593,12 @@ def test_constructor_dict(self): tm.assert_dict_equal(self.ts2, frame['col2'], compare_keys=False) frame = DataFrame({'col1' : self.ts1, - 'col2' : self.ts2}, + 'col2' : self.ts2}, columns=['col2', 'col3', 'col4']) self.assertEqual(len(frame), len(self.ts2)) self.assert_('col1' not in frame) - self.assert_(np.isnan(frame['col3']).all()) + self.assert_(isnull(frame['col3']).all()) # Corner cases self.assertEqual(len(DataFrame({})), 0) @@ -1888,7 +1888,11 @@ def test_constructor_corner(self): # does not error but ends up float df = DataFrame(index=range(10), columns=['a','b'], dtype=int) - self.assert_(df.values.dtype == np.float64) + self.assert_(df.values.dtype == np.object_) + + # #1783 empty dtype object + df = DataFrame({}, columns=['foo', 'bar']) + self.assert_(df.values.dtype == np.object_) def test_constructor_scalar_inference(self): data = {'int' : 1, 'bool' : True, @@ -3305,7 +3309,9 @@ def test_to_csv_multiindex(self): recons = DataFrame.from_csv(path) exp = tsframe[:0] exp.index = [] - assert_frame_equal(recons, exp) + + self.assert_(recons.columns.equals(exp.columns)) + self.assert_(len(recons) == 0) def test_to_csv_float32_nanrep(self): df = DataFrame(np.random.randn(1, 4).astype(np.float32)) @@ -6632,7 +6638,7 @@ def test_boolean_indexing(self): def test_sum_bools(self): df = DataFrame(index=range(1), columns=range(10)) - bools = np.isnan(df) + bools = isnull(df) self.assert_(bools.sum(axis=1)[0] == 10) def test_fillna_col_reordering(self): diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index ad0b76c45e4b9..b3f3bd95f9c54 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -8,7 +8,7 @@ from pandas.core.index import Index, MultiIndex from pandas.core.common import rands from pandas.core.api import Categorical, DataFrame -from pandas.core.groupby import GroupByError +from pandas.core.groupby import GroupByError, SpecificationError, DataError from pandas.core.series import Series from pandas.util.testing import (assert_panel_equal, assert_frame_equal, assert_series_equal, assert_almost_equal) @@ -252,11 +252,10 @@ def test_agg_apply_corner(self): # DataFrame grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan) - assert_frame_equal(grouped.sum(), - DataFrame(columns=self.tsframe.columns)) - assert_frame_equal(grouped.agg(np.sum), - DataFrame(columns=self.tsframe.columns)) - assert_frame_equal(grouped.apply(np.sum), DataFrame({})) + exp_df = DataFrame(columns=self.tsframe.columns, dtype=float) + assert_frame_equal(grouped.sum(), exp_df) + assert_frame_equal(grouped.agg(np.sum), exp_df) + assert_frame_equal(grouped.apply(np.sum), DataFrame({}, dtype=float)) def test_agg_grouping_is_list_tuple(self): from pandas.core.groupby import Grouping @@ -1078,11 +1077,11 @@ def test_cython_agg_boolean(self): def test_cython_agg_nothing_to_agg(self): frame = DataFrame({'a': np.random.randint(0, 5, 50), 'b': ['foo', 'bar'] * 25}) - self.assertRaises(GroupByError, frame.groupby('a')['b'].mean) + self.assertRaises(DataError, frame.groupby('a')['b'].mean) frame = DataFrame({'a': np.random.randint(0, 5, 50), 'b': ['foo', 'bar'] * 25}) - self.assertRaises(GroupByError, frame[['b']].groupby(frame['a']).mean) + self.assertRaises(DataError, frame[['b']].groupby(frame['a']).mean) def test_wrap_aggregated_output_multindex(self): df = self.mframe.T @@ -1847,6 +1846,12 @@ def test_multiple_functions_tuples_and_non_tuples(self): expected = self.df.groupby('A').agg(ex_funcs) assert_frame_equal(result, expected) + def test_agg_multiple_functions_too_many_lambdas(self): + grouped = self.df.groupby('A') + funcs = ['mean', lambda x: x.mean(), lambda x: x.std()] + + self.assertRaises(SpecificationError, grouped.agg, funcs) + def test_more_flexible_frame_multi_function(self): from pandas import concat