From 41dc3f13134fc74502eb38239d1ce5eb7f205e29 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Thu, 26 Dec 2019 20:39:36 +0800 Subject: [PATCH 1/6] TST: test for GroupBy.quantile with integer columns and arraylike q (GH30289) --- pandas/tests/groupby/test_function.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 4ca23c61ba920..a2423374a4e8f 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1398,6 +1398,22 @@ def test_quantile_array_multiple_levels(): tm.assert_frame_equal(result, expected) +def test_groupby_quantile_with_arraylike_q_and_int_columns(): + # GH30289 + df = pd.DataFrame(np.array([2 * [_ % 4] for _ in range(10)]), columns=[0, 1]) + + quantiles = [0.5, 0.6] + expected_index = pd.MultiIndex.from_product( + [[0, 1, 2, 3], [0.5, 0.6]], names=[0, None] + ) + + expected_values = [float(x) for x in [0, 0, 1, 1, 2, 2, 3, 3]] + expected = pd.DataFrame(expected_values, index=expected_index, columns=[1]) + result = df.groupby(0).quantile(quantiles) + + tm.assert_frame_equal(result, expected) + + def test_quantile_raises(): df = pd.DataFrame( [["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"] From cdcf3ea14c563aa08f520ed8fbc456c11a8579c8 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Thu, 26 Dec 2019 20:43:31 +0800 Subject: [PATCH 2/6] BUG: GroupBy.quantile with integer columns and arraylike q (GH30289) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/groupby/groupby.py | 23 ++++++++++------------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 9023cf2ab1b4f..6ef43142fa386 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -848,6 +848,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.agg` with timezone-aware datetime64 column incorrectly casting results to the original dtype (:issue:`29641`) - Bug in :meth:`DataFrame.groupby` when using axis=1 and having a single level columns index (:issue:`30208`) - Bug in :meth:`DataFrame.groupby` when using nunique on axis=1 (:issue:`30253`) +- Bug in :meth:`GroupBy.quantile` with multiple list-like q value and integer column names Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 529d123d256e8..d8b5a9ed67959 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1937,21 +1937,18 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: # >>> result.stack(0).loc[pd.IndexSlice[:, ..., q], :] # but this hits https://github.com/pandas-dev/pandas/issues/10710 # which doesn't reorder the list-like `q` on the inner level. - order = np.roll(list(range(result.index.nlevels)), -1) + order = np.append(np.arange(1, result.index.nlevels), 0) + # temporarily saves the index names + index_names = np.array(result.index.names) + # set index names to positions to avoid confusion + result.index.names = np.arange(len(index_names)) + # place quantiles on the inside result = result.reorder_levels(order) - result = result.reindex(q, level=-1) + # restore the index names in order + result.index.names = index_names[order] - # fix order. - hi = len(q) * self.ngroups - arr = np.arange(0, hi, self.ngroups) - arrays = [] - - for i in range(self.ngroups): - arr2 = arr + i - arrays.append(arr2) - - indices = np.concatenate(arrays) - assert len(indices) == len(result) + # reorder rows to keep things sorted + indices = np.arange(len(result)).reshape([len(q), self.ngroups]).T.flatten() return result.take(indices) @Substitution(name="groupby") From 9503bbf0129d9f2694b4dc8854549146e77c5d07 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Thu, 26 Dec 2019 21:46:05 +0800 Subject: [PATCH 3/6] CLN: cleaned up previous commit with slight modification (GH30289) --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/groupby/groupby.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 6ef43142fa386..ac4c612f7b019 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -848,7 +848,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.agg` with timezone-aware datetime64 column incorrectly casting results to the original dtype (:issue:`29641`) - Bug in :meth:`DataFrame.groupby` when using axis=1 and having a single level columns index (:issue:`30208`) - Bug in :meth:`DataFrame.groupby` when using nunique on axis=1 (:issue:`30253`) -- Bug in :meth:`GroupBy.quantile` with multiple list-like q value and integer column names +- Bug in :meth:`GroupBy.quantile` with multiple list-like q value and integer column names (:issue:`30289`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d8b5a9ed67959..227547daf3668 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1937,13 +1937,17 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: # >>> result.stack(0).loc[pd.IndexSlice[:, ..., q], :] # but this hits https://github.com/pandas-dev/pandas/issues/10710 # which doesn't reorder the list-like `q` on the inner level. - order = np.append(np.arange(1, result.index.nlevels), 0) + order = list(range(1, result.index.nlevels)) + [0] + # temporarily saves the index names index_names = np.array(result.index.names) + # set index names to positions to avoid confusion result.index.names = np.arange(len(index_names)) + # place quantiles on the inside result = result.reorder_levels(order) + # restore the index names in order result.index.names = index_names[order] From 1536032dfbc706ba9659b7615a00fa97a3ea30ba Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Fri, 27 Dec 2019 00:11:53 +0800 Subject: [PATCH 4/6] TST: parametrized test and added test cases (GH30289) --- pandas/tests/groupby/test_function.py | 32 ++++++++++++++++++++------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index a2423374a4e8f..ca6e3db555673 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1398,18 +1398,34 @@ def test_quantile_array_multiple_levels(): tm.assert_frame_equal(result, expected) -def test_groupby_quantile_with_arraylike_q_and_int_columns(): +@pytest.mark.parametrize("frame_size", [(2, 2), (100, 10)]) +@pytest.mark.parametrize("groupby", [[0], [0, 1]]) +@pytest.mark.parametrize("q", [[0.5, 0.6], [0.1, 0.9]]) +def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q): # GH30289 - df = pd.DataFrame(np.array([2 * [_ % 4] for _ in range(10)]), columns=[0, 1]) + nrow, ncol = frame_size + if len(groupby) >= ncol or any([by >= ncol for by in groupby]): + pytest.skip(f"Invalid argument groupby={groupby}") - quantiles = [0.5, 0.6] - expected_index = pd.MultiIndex.from_product( - [[0, 1, 2, 3], [0.5, 0.6]], names=[0, None] + df = pd.DataFrame( + np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol) ) - expected_values = [float(x) for x in [0, 0, 1, 1, 2, 2, 3, 3]] - expected = pd.DataFrame(expected_values, index=expected_index, columns=[1]) - result = df.groupby(0).quantile(quantiles) + idx_levels = [list(range(min(nrow, 4)))] * len(groupby) + [q] + idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [ + list(range(len(q))) * min(nrow, 4) + ] + expected_index = pd.MultiIndex( + levels=idx_levels, codes=idx_codes, names=groupby + [None] + ) + expected_values = [ + [float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q + ] + expected_columns = [x for x in range(ncol) if x not in groupby] + expected = pd.DataFrame( + expected_values, index=expected_index, columns=expected_columns + ) + result = df.groupby(groupby).quantile(q) tm.assert_frame_equal(result, expected) From 454a45f40e4f069b15a2975026cdcc18e60fc610 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Fri, 27 Dec 2019 00:43:54 +0800 Subject: [PATCH 5/6] CLN: removed unnecessary list comprehension (GH30289) --- pandas/tests/groupby/test_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index ca6e3db555673..51564154755f4 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1404,7 +1404,7 @@ def test_quantile_array_multiple_levels(): def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q): # GH30289 nrow, ncol = frame_size - if len(groupby) >= ncol or any([by >= ncol for by in groupby]): + if len(groupby) >= ncol or any(by >= ncol for by in groupby): pytest.skip(f"Invalid argument groupby={groupby}") df = pd.DataFrame( From 9a6e298be881df944e78666b6bbadd16b1ef40ee Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Fri, 27 Dec 2019 08:49:30 +0800 Subject: [PATCH 6/6] CLN: cleaned up test cases (GH30289) --- pandas/tests/groupby/test_function.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 51564154755f4..c41c9b4db053a 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1398,15 +1398,12 @@ def test_quantile_array_multiple_levels(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("frame_size", [(2, 2), (100, 10)]) +@pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)]) @pytest.mark.parametrize("groupby", [[0], [0, 1]]) -@pytest.mark.parametrize("q", [[0.5, 0.6], [0.1, 0.9]]) +@pytest.mark.parametrize("q", [[0.5, 0.6]]) def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q): # GH30289 nrow, ncol = frame_size - if len(groupby) >= ncol or any(by >= ncol for by in groupby): - pytest.skip(f"Invalid argument groupby={groupby}") - df = pd.DataFrame( np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol) )