From a2320e99fe3609eec08661900e5be800860618d8 Mon Sep 17 00:00:00 2001 From: root Date: Sat, 8 Aug 2020 14:42:10 +0000 Subject: [PATCH 1/5] add test for agg on ordered categorical cols --- .../tests/groupby/aggregate/test_aggregate.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 40a20c8210052..21f1361f111b0 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1061,3 +1061,37 @@ def test_groupby_get_by_index(): res = df.groupby("A").agg({"B": lambda x: x.get(x.index[-1])}) expected = pd.DataFrame(dict(A=["S", "W"], B=[1.0, 2.0])).set_index("A") pd.testing.assert_frame_equal(res, expected) + + +def test_groupby_agg_categorical_cols(): + """ + test aggregation on ordered categorical + columns #27800 + """ + + # create the result dataframe + input_df = pd.DataFrame( + { + "nr": [1, 2, 3, 4, 5, 6, 7, 8], + "cat_ord": list("aabbccdd"), + "cat": list("aaaabbbb"), + } + ) + + input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) + input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() + result_df = input_df.groupby("cat").agg({"nr": ["min", "max"], "cat_ord": "min"}) + + # create expected dataframe + cat_index = pd.CategoricalIndex( + ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" + ) + + multi_index_tuple = [("nr", "min"), ("nr", "max"), ("cat_ord", "min")] + multi_index = pd.MultiIndex.from_tuples(multi_index_tuple) + + data = np.array([(1, 4, "a"), (5, 8, "c")]) + expected_df = pd.DataFrame(data=data, columns=multi_index, index=cat_index) + expected_df["nr"] = expected_df["nr"].astype("int64") + + tm.assert_frame_equal(result_df, expected_df) From 87c9eaed26dc05c29fa737c909030875774b0631 Mon Sep 17 00:00:00 2001 From: root Date: Sat, 8 Aug 2020 20:56:05 +0000 Subject: [PATCH 2/5] replace np arr and dtype conversion with 
list --- pandas/tests/groupby/aggregate/test_aggregate.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 21f1361f111b0..4d87e2c802615 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1090,8 +1090,7 @@ def test_groupby_agg_categorical_cols(): multi_index_tuple = [("nr", "min"), ("nr", "max"), ("cat_ord", "min")] multi_index = pd.MultiIndex.from_tuples(multi_index_tuple) - data = np.array([(1, 4, "a"), (5, 8, "c")]) + data = [(1, 4, "a"), (5, 8, "c")] expected_df = pd.DataFrame(data=data, columns=multi_index, index=cat_index) - expected_df["nr"] = expected_df["nr"].astype("int64") tm.assert_frame_equal(result_df, expected_df) From ae5c1e3ac67b9d5a11a70845df46bf7e52b071c4 Mon Sep 17 00:00:00 2001 From: root Date: Sat, 15 Aug 2020 13:01:48 +0000 Subject: [PATCH 3/5] add more test cases for ordered categorical columns --- .../tests/groupby/aggregate/test_aggregate.py | 160 +++++++++++++++++- 1 file changed, 157 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 4d87e2c802615..bbc0e4a0ad9e7 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1063,10 +1063,164 @@ def test_groupby_get_by_index(): pd.testing.assert_frame_equal(res, expected) -def test_groupby_agg_categorical_cols(): +def test_groupby_single_agg_numeric_col(): """ - test aggregation on ordered categorical - columns #27800 + test single agg on a numeric column + """ + + # create the result dataframe + input_df = pd.DataFrame( + { + "nr": [1, 2, 3, 4, 5, 6, 7, 8], + "cat_ord": list("aabbccdd"), + "cat": list("aaaabbbb"), + } + ) + + input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) + input_df["cat_ord"] = 
input_df["cat_ord"].cat.as_ordered() + result_df = input_df.groupby("cat").agg({"nr": "min"}) + + # create expected dataframe + cat_index = pd.CategoricalIndex( + ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" + ) + + expected_df = pd.DataFrame(data={"nr": [1, 5]}, index=cat_index) + + tm.assert_frame_equal(result_df, expected_df) + + +def test_groupby_single_agg_cat_cols(): + """ + test single agg on a ordered categorical column + """ + + # create the result dataframe + input_df = pd.DataFrame( + { + "nr": [1, 2, 3, 4, 5, 6, 7, 8], + "cat_ord": list("aabbccdd"), + "cat": list("aaaabbbb"), + } + ) + + input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) + input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() + result_df = input_df.groupby("cat").agg({"cat_ord": "min"}) + + # create expected dataframe + cat_index = pd.CategoricalIndex( + ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" + ) + + expected_df = pd.DataFrame(data={"cat_ord": ["a", "c"]}, index=cat_index) + + tm.assert_frame_equal(result_df, expected_df) + + +def test_groupby_combined_single_agg_cat_cols(): + """ + test combined single aggregations on a + numeric and multiple aggregation an ordered + categorical column + """ + + # create the result dataframe + input_df = pd.DataFrame( + { + "nr": [1, 2, 3, 4, 5, 6, 7, 8], + "cat_ord": list("aabbccdd"), + "cat": list("aaaabbbb"), + } + ) + + input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) + input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() + result_df = input_df.groupby("cat").agg({"nr": "min", "cat_ord": "min"}) + + # create expected dataframe + cat_index = pd.CategoricalIndex( + ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" + ) + + expected_df = pd.DataFrame( + data={"nr": [1, 5], "cat_ord": ["a", "c"]}, index=cat_index + ) + + tm.assert_frame_equal(result_df, expected_df) + + +def 
test_groupby_multiple_agg_cat_cols(): + """ + test multiple aggregations on an ordered categorical column + """ + + # create the result dataframe + input_df = pd.DataFrame( + { + "nr": [1, 2, 3, 4, 5, 6, 7, 8], + "cat_ord": list("aabbccdd"), + "cat": list("aaaabbbb"), + } + ) + + input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) + input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() + result_df = input_df.groupby("cat").agg({"cat_ord": ["min", "max"]}) + + # create expected dataframe + cat_index = pd.CategoricalIndex( + ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" + ) + + multi_index_tuple = [("cat_ord", "min"), ("cat_ord", "max")] + multi_index = pd.MultiIndex.from_tuples(multi_index_tuple) + + data = [("a", "b"), ("c", "d")] + expected_df = pd.DataFrame(data=data, columns=multi_index, index=cat_index) + + tm.assert_frame_equal(result_df, expected_df) + + +def test_groupby_combined_multiple_agg_cat_cols(): + """ + test single aggregations on a numeric and + multiple aggregations an ordered categorical column + """ + + # create the result dataframe + input_df = pd.DataFrame( + { + "nr": [1, 2, 3, 4, 5, 6, 7, 8], + "cat_ord": list("aabbccdd"), + "cat": list("aaaabbbb"), + } + ) + + input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) + input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() + result_df = input_df.groupby("cat").agg({"nr": "min", "cat_ord": ["min", "max"]}) + + # create expected dataframe + cat_index = pd.CategoricalIndex( + ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" + ) + + multi_index_tuple = [("nr", "min"), ("cat_ord", "min"), ("cat_ord", "max")] + multi_index = pd.MultiIndex.from_tuples(multi_index_tuple) + + data = [(1, "a", "b"), (5, "c", "d")] + expected_df = pd.DataFrame(data=data, columns=multi_index, index=cat_index) + + tm.assert_frame_equal(result_df, expected_df) + + +def test_groupby_combined_multiple_numeric_cat_cols(): 
+ """ + test multiple aggregation on numeric and a + single aggregation on an ordered categorical + column #27800 """ # create the result dataframe From 3380aa4862975ccb942752891afb8d322d86ff50 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 17 Aug 2020 22:30:38 +0000 Subject: [PATCH 4/5] empty commit to trigger travis rebuild From 603602df322a80cb7f53cd95df6c8b87f112359b Mon Sep 17 00:00:00 2001 From: root Date: Fri, 21 Aug 2020 16:25:31 +0000 Subject: [PATCH 5/5] parameterize tests for ordered cat col aggregations --- .../tests/groupby/aggregate/test_aggregate.py | 176 ++++-------------- 1 file changed, 34 insertions(+), 142 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index e33bb82066bd1..8fe450fe6abfc 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1063,131 +1063,16 @@ def test_groupby_get_by_index(): pd.testing.assert_frame_equal(res, expected) -def test_groupby_single_agg_numeric_col(): - """ - test single agg on a numeric column - """ - - # create the result dataframe - input_df = pd.DataFrame( - { - "nr": [1, 2, 3, 4, 5, 6, 7, 8], - "cat_ord": list("aabbccdd"), - "cat": list("aaaabbbb"), - } - ) - - input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) - input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() - result_df = input_df.groupby("cat").agg({"nr": "min"}) - - # create expected dataframe - cat_index = pd.CategoricalIndex( - ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" - ) - - expected_df = pd.DataFrame(data={"nr": [1, 5]}, index=cat_index) - - tm.assert_frame_equal(result_df, expected_df) - - -def test_groupby_single_agg_cat_cols(): - """ - test single agg on a ordered categorical column - """ - - # create the result dataframe - input_df = pd.DataFrame( - { - "nr": [1, 2, 3, 4, 5, 6, 7, 8], - "cat_ord": list("aabbccdd"), - "cat": 
list("aaaabbbb"), - } - ) - - input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) - input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() - result_df = input_df.groupby("cat").agg({"cat_ord": "min"}) - - # create expected dataframe - cat_index = pd.CategoricalIndex( - ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" - ) - - expected_df = pd.DataFrame(data={"cat_ord": ["a", "c"]}, index=cat_index) - - tm.assert_frame_equal(result_df, expected_df) - - -def test_groupby_combined_single_agg_cat_cols(): - """ - test combined single aggregations on a - numeric and multiple aggregation an ordered - categorical column - """ - - # create the result dataframe - input_df = pd.DataFrame( - { - "nr": [1, 2, 3, 4, 5, 6, 7, 8], - "cat_ord": list("aabbccdd"), - "cat": list("aaaabbbb"), - } - ) - - input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) - input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() - result_df = input_df.groupby("cat").agg({"nr": "min", "cat_ord": "min"}) - - # create expected dataframe - cat_index = pd.CategoricalIndex( - ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" - ) - - expected_df = pd.DataFrame( - data={"nr": [1, 5], "cat_ord": ["a", "c"]}, index=cat_index - ) - - tm.assert_frame_equal(result_df, expected_df) - - -def test_groupby_multiple_agg_cat_cols(): - """ - test multiple aggregations on an ordered categorical column - """ - - # create the result dataframe - input_df = pd.DataFrame( - { - "nr": [1, 2, 3, 4, 5, 6, 7, 8], - "cat_ord": list("aabbccdd"), - "cat": list("aaaabbbb"), - } - ) - - input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) - input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() - result_df = input_df.groupby("cat").agg({"cat_ord": ["min", "max"]}) - - # create expected dataframe - cat_index = pd.CategoricalIndex( - ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" - ) 
- - multi_index_tuple = [("cat_ord", "min"), ("cat_ord", "max")] - multi_index = pd.MultiIndex.from_tuples(multi_index_tuple) - - data = [("a", "b"), ("c", "d")] - expected_df = pd.DataFrame(data=data, columns=multi_index, index=cat_index) - - tm.assert_frame_equal(result_df, expected_df) - - -def test_groupby_combined_multiple_agg_cat_cols(): - """ - test single aggregations on a numeric and - multiple aggregations an ordered categorical column - """ +@pytest.mark.parametrize( + "grp_col_dict, exp_data", + [ + ({"nr": "min", "cat_ord": "min"}, {"nr": [1, 5], "cat_ord": ["a", "c"]}), + ({"cat_ord": "min"}, {"cat_ord": ["a", "c"]}), + ({"nr": "min"}, {"nr": [1, 5]}), + ], +) +def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data): + # test single aggregations on ordered categorical cols GH27800 # create the result dataframe input_df = pd.DataFrame( @@ -1200,28 +1085,28 @@ def test_groupby_combined_multiple_agg_cat_cols(): input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() - result_df = input_df.groupby("cat").agg({"nr": "min", "cat_ord": ["min", "max"]}) + result_df = input_df.groupby("cat").agg(grp_col_dict) # create expected dataframe cat_index = pd.CategoricalIndex( ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" ) - multi_index_tuple = [("nr", "min"), ("cat_ord", "min"), ("cat_ord", "max")] - multi_index = pd.MultiIndex.from_tuples(multi_index_tuple) - - data = [(1, "a", "b"), (5, "c", "d")] - expected_df = pd.DataFrame(data=data, columns=multi_index, index=cat_index) + expected_df = pd.DataFrame(data=exp_data, index=cat_index) tm.assert_frame_equal(result_df, expected_df) -def test_groupby_combined_multiple_numeric_cat_cols(): - """ - test multiple aggregation on numeric and a - single aggregation on an ordered categorical - column #27800 - """ +@pytest.mark.parametrize( + "grp_col_dict, exp_data", + [ + ({"nr": ["min", "max"], "cat_ord": 
"min"}, [(1, 4, "a"), (5, 8, "c")]), + ({"nr": "min", "cat_ord": ["min", "max"]}, [(1, "a", "b"), (5, "c", "d")]), + ({"cat_ord": ["min", "max"]}, [("a", "b"), ("c", "d")]), + ], +) +def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data): + # test combined aggregations on ordered categorical cols GH27800 # create the result dataframe input_df = pd.DataFrame( @@ -1234,18 +1119,25 @@ def test_groupby_combined_multiple_numeric_cat_cols(): input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() - result_df = input_df.groupby("cat").agg({"nr": ["min", "max"], "cat_ord": "min"}) + result_df = input_df.groupby("cat").agg(grp_col_dict) # create expected dataframe cat_index = pd.CategoricalIndex( ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" ) - multi_index_tuple = [("nr", "min"), ("nr", "max"), ("cat_ord", "min")] - multi_index = pd.MultiIndex.from_tuples(multi_index_tuple) + # unpack the grp_col_dict to create the multi-index tuple + # this tuple will be used to create the expected dataframe index + multi_index_list = [] + for k, v in grp_col_dict.items(): + if isinstance(v, list): + for value in v: + multi_index_list.append([k, value]) + else: + multi_index_list.append([k, v]) + multi_index = pd.MultiIndex.from_tuples(tuple(multi_index_list)) - data = [(1, 4, "a"), (5, 8, "c")] - expected_df = pd.DataFrame(data=data, columns=multi_index, index=cat_index) + expected_df = pd.DataFrame(data=exp_data, columns=multi_index, index=cat_index) tm.assert_frame_equal(result_df, expected_df)