From de81cc1c124e268c73c082fb3cd8892dbb4d83be Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 3 Mar 2012 18:37:18 -0500 Subject: [PATCH] BUG: fix int32 overflow when computing group_index, GH #850 --- pandas/core/groupby.py | 2 +- pandas/tests/test_groupby.py | 9 +++++++++ pandas/tools/plotting.py | 25 ++++++++++++++++--------- setup.py | 4 ++-- 4 files changed, 28 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 8cd0c38eec246..9fe0810aeb324 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1472,7 +1472,7 @@ def get_group_index(label_list, shape): mask = np.zeros(n, dtype=bool) for i in xrange(len(shape)): stride = np.prod([x for x in shape[i+1:]], dtype=int) - group_index += label_list[i] * stride + group_index += com._ensure_int64(label_list[i]) * stride mask |= label_list[i] < 0 np.putmask(group_index, mask, -1) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 355a8cf94aef2..153b8c5f7bb85 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1457,6 +1457,15 @@ def test_groupby_2d_malformed(self): self.assert_(np.array_equal(tmp.columns, ['zeros', 'ones'])) self.assert_(np.array_equal(tmp.values, res_values)) + def test_int32_overflow(self): + B = np.concatenate((np.arange(100000), np.arange(100000), + np.arange(50000))) + A = np.arange(250000) + df = DataFrame({'A' : A, 'B' : B, 'C' : np.random.randn(250000)}) + + left = df.groupby(['A', 'B']).sum() + right = df.groupby(['B', 'A']).sum() + self.assert_(len(left) == len(right)) def test_decons(): from pandas.core.groupby import decons_group_index, get_group_index diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index f6bc33ee15624..3e81643ee3490 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -22,21 +22,24 @@ def hist(data, column, by=None, ax=None, fontsize=None): return ax def grouped_hist(data, column=None, by=None, ax=None, bins=50, log=False, - figsize=None, layout=None): + figsize=None, layout=None, sharex=False, sharey=False, + rot=90): """ Returns ------- fig : matplotlib.Figure """ - if isinstance(data, DataFrame): - data = data[column] + # if isinstance(data, DataFrame): + # data = data[column] def plot_group(group, ax): ax.hist(group.dropna(), bins=bins) - fig, axes = _grouped_plot(plot_group, data, by=by, sharex=False, - sharey=False, figsize=figsize, - layout=layout) + + fig, axes = _grouped_plot(plot_group, data, column=column, + by=by, sharex=sharex, sharey=sharey, + figsize=figsize, layout=layout, + rot=rot) fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, hspace=0.3, wspace=0.2) return fig @@ -45,7 +48,7 @@ def plot_group(group, ax): def boxplot(data, column=None, by=None, ax=None, fontsize=None, rot=0, grid=True, figsize=None): """ - Make a box plot from DataFrame column optionally grouped by some columns or + Make a box plot from DataFrame column optionally grouped b ysome columns or other inputs Parameters @@ -142,8 +145,9 @@ def plot_group(group, ax): return fig -def _grouped_plot(plotf, data, by=None, numeric_only=True, figsize=None, - sharex=True, sharey=True, layout=None): +def _grouped_plot(plotf, data, column=None, by=None, numeric_only=True, + figsize=None, sharex=True, sharey=True, layout=None, + rot=0): import matplotlib.pyplot as plt # allow to specify mpl default with 'default' @@ -151,6 +155,9 @@ def _grouped_plot(plotf, data, by=None, numeric_only=True, figsize=None, figsize = (10, 5) # our default grouped = data.groupby(by) + if column is not None: + grouped = grouped[column] + ngroups = len(grouped) nrows, ncols = layout or _get_layout(ngroups) diff --git a/setup.py b/setup.py index d27f264160e71..2868b765d49dc 100755 --- a/setup.py +++ b/setup.py @@ -164,8 +164,8 @@ MAJOR = 0 MINOR = 7 -MICRO = 1 -ISRELEASED = True +MICRO = 2 +ISRELEASED = False VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO) QUALIFIER = ''