apache · zhaoyongjie · Feb 17, 2022 · Feb 17, 2022
diff --git a/tests/integration_tests/pandas_postprocessing_tests.py b/tests/integration_tests/pandas_postprocessing_tests.py
diff --git a/.../integration_tests/fixtures/dataframes.py → tests/unit_tests/fixtures/dataframes.py b/.../integration_tests/fixtures/dataframes.py → tests/unit_tests/fixtures/dataframes.py
diff --git a/tests/unit_tests/pandas_postprocessing/__init__.py b/tests/unit_tests/pandas_postprocessing/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/tests/unit_tests/pandas_postprocessing/test_aggregate.py b/tests/unit_tests/pandas_postprocessing/test_aggregate.py
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from superset.utils.pandas_postprocessing import aggregate
+from tests.unit_tests.fixtures.dataframes import categories_df
+from tests.unit_tests.pandas_postprocessing.utils import series_to_list
+
+
+def test_aggregate():
+    """Aggregate ``categories_df`` by ``constant`` and check sum/percentiles."""
+    asc_sum = {"column": "asc_idx", "operator": "sum"}
+    asc_q2 = {"column": "asc_idx", "operator": "percentile", "options": {"q": 75}}
+    desc_q1 = {"column": "desc_idx", "operator": "percentile", "options": {"q": 25}}
+    result = aggregate(
+        df=categories_df,
+        groupby=["constant"],
+        aggregates={"asc sum": asc_sum, "asc q2": asc_q2, "desc q1": desc_q1},
+    )
+
+    # The groupby column is expected first, then one column per aggregate.
+    expected_columns = ["constant", "asc sum", "asc q2", "desc q1"]
+    assert result.columns.tolist() == expected_columns
+
+    # Only the first row of each aggregate column is inspected.
+    assert series_to_list(result["asc sum"])[0] == 5050
+    assert series_to_list(result["asc q2"])[0] == 75
+    assert series_to_list(result["desc q1"])[0] == 25
diff --git a/tests/unit_tests/pandas_postprocessing/test_boxplot.py b/tests/unit_tests/pandas_postprocessing/test_boxplot.py
@@ -0,0 +1,126 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import pytest
+
+from superset.exceptions import QueryObjectValidationError
+from superset.utils.core import PostProcessingBoxplotWhiskerType
+from superset.utils.pandas_postprocessing import boxplot
+from tests.unit_tests.fixtures.dataframes import names_df
+
+
+def test_boxplot_tukey():
+    """Check the columns and row count of ``boxplot`` with Tukey whiskers."""
+    whiskers = PostProcessingBoxplotWhiskerType.TUKEY
+    result = boxplot(
+        df=names_df, groupby=["region"], whisker_type=whiskers, metrics=["cars"]
+    )
+
+    # Column order is not part of this check, so compare as sets.
+    assert set(result.columns) == {
+        "region",
+        "cars__count",
+        "cars__max",
+        "cars__mean",
+        "cars__median",
+        "cars__min",
+        "cars__outliers",
+        "cars__q1",
+        "cars__q3",
+    }
+    assert len(result) == 4
+
+
+def test_boxplot_min_max():
+    """Check the columns and row count of ``boxplot`` with min/max whiskers."""
+    whiskers = PostProcessingBoxplotWhiskerType.MINMAX
+    result = boxplot(
+        df=names_df, groupby=["region"], whisker_type=whiskers, metrics=["cars"]
+    )
+
+    # Column order is not part of this check, so compare as sets.
+    assert set(result.columns) == {
+        "region",
+        "cars__count",
+        "cars__max",
+        "cars__mean",
+        "cars__median",
+        "cars__min",
+        "cars__outliers",
+        "cars__q1",
+        "cars__q3",
+    }
+    assert len(result) == 4
+
+
+def test_boxplot_percentile():
+    """Check ``boxplot`` output when whiskers are the 1st/99th percentiles."""
+    result = boxplot(
+        df=names_df,
+        groupby=["region"],
+        whisker_type=PostProcessingBoxplotWhiskerType.PERCENTILE,
+        metrics=["cars"],
+        percentiles=[1, 99],
+    )
+    assert set(result.columns) == {
+        "region",
+        "cars__count",
+        "cars__max",
+        "cars__mean",
+        "cars__median",
+        "cars__min",
+        "cars__outliers",
+        "cars__q1",
+        "cars__q3",
+    }
+    assert len(result) == 4
+
+
+def test_boxplot_percentile_incorrect_params():
+    """Expect a validation error for each unusable ``percentiles`` argument."""
+    invalid_percentile_kwargs = (
+        {},  # ``percentiles`` not passed at all
+        {"percentiles": [10]},  # a single value
+        {"percentiles": [90, 10]},  # first value larger than the second
+        {"percentiles": [10, 90, 10]},  # three values
+    )
+    # Cases run in the listed order; the first one that does not raise fails
+    # the test, exactly as a sequence of separate ``pytest.raises`` blocks would.
+    for extra_kwargs in invalid_percentile_kwargs:
+        with pytest.raises(QueryObjectValidationError):
+            boxplot(
+                df=names_df,
+                groupby=["region"],
+                whisker_type=PostProcessingBoxplotWhiskerType.PERCENTILE,
+                metrics=["cars"],
+                **extra_kwargs,
+            )
diff --git a/tests/unit_tests/pandas_postprocessing/test_compare.py b/tests/unit_tests/pandas_postprocessing/test_compare.py
@@ -0,0 +1,62 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from superset.utils.pandas_postprocessing import compare
+from tests.unit_tests.fixtures.dataframes import timeseries_df2
+from tests.unit_tests.pandas_postprocessing.utils import series_to_list
+
+
+def test_compare():
+    """Run ``compare`` on ``y`` vs ``z`` as difference, percentage and ratio."""
+
+    def run_compare(compare_type, **extra_kwargs):
+        # Build the column lists anew on every call so that no list object is
+        # shared between invocations.
+        return compare(
+            df=timeseries_df2,
+            source_columns=["y"],
+            compare_columns=["z"],
+            compare_type=compare_type,
+            **extra_kwargs,
+        )
+
+    # `difference` comparison
+    diff_df = run_compare("difference")
+    assert diff_df.columns.tolist() == ["label", "y", "z", "difference__y__z"]
+    assert series_to_list(diff_df["difference__y__z"]) == [0.0, -2.0, -8.0, -6.0]
+
+    # drop original columns
+    dropped_df = run_compare("difference", drop_original_columns=True)
+    assert dropped_df.columns.tolist() == ["label", "difference__y__z"]
+
+    # `percentage` comparison
+    pct_df = run_compare("percentage")
+    assert pct_df.columns.tolist() == ["label", "y", "z", "percentage__y__z"]
+    assert series_to_list(pct_df["percentage__y__z"]) == [0.0, -0.5, -0.8, -0.75]
+
+    # `ratio` comparison
+    ratio_df = run_compare("ratio")
+    assert ratio_df.columns.tolist() == ["label", "y", "z", "ratio__y__z"]
+    assert series_to_list(ratio_df["ratio__y__z"]) == [1.0, 0.5, 0.2, 0.25]
diff --git a/tests/unit_tests/pandas_postprocessing/test_contribution.py b/tests/unit_tests/pandas_postprocessing/test_contribution.py
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from datetime import datetime
+
+import pytest
+from pandas import DataFrame
+
+from superset.exceptions import QueryObjectValidationError
+from superset.utils.core import DTTM_ALIAS, PostProcessingContributionOrientation
+from superset.utils.pandas_postprocessing import contribution
+
+
+def test_contribution():
+    """Check ``contribution`` validation errors and row/column orientations."""
+    orientation = PostProcessingContributionOrientation
+    timestamps = [datetime(2020, 7, 16, 14, 49), datetime(2020, 7, 16, 14, 50)]
+    frame = DataFrame({DTTM_ALIAS: timestamps, "a": [1, 3], "b": [1, 9]})
+
+    # selecting the datetime column is expected to fail as "not numeric"
+    with pytest.raises(QueryObjectValidationError, match="not numeric"):
+        contribution(frame, columns=[DTTM_ALIAS])
+
+    # one column with two rename targets is expected to fail on length
+    with pytest.raises(QueryObjectValidationError, match="same length"):
+        contribution(frame, columns=["a"], rename_columns=["aa", "bb"])
+
+    # cell contribution across row
+    by_row = contribution(frame, orientation=orientation.ROW)
+    assert by_row.columns.tolist() == [DTTM_ALIAS, "a", "b"]
+    assert by_row["a"].tolist() == [0.5, 0.25]
+    assert by_row["b"].tolist() == [0.5, 0.75]
+
+    # cell contribution across column without temporal column
+    # (the temporal column is removed from ``frame`` in place from here on)
+    frame.pop(DTTM_ALIAS)
+    by_column = contribution(frame, orientation=orientation.COLUMN)
+    assert by_column.columns.tolist() == ["a", "b"]
+    assert by_column["a"].tolist() == [0.25, 0.75]
+    assert by_column["b"].tolist() == [0.1, 0.9]
+
+    # contribution only on selected columns
+    selected = contribution(
+        frame,
+        orientation=orientation.COLUMN,
+        columns=["a"],
+        rename_columns=["pct_a"],
+    )
+    assert selected.columns.tolist() == ["a", "b", "pct_a"]
+    assert selected["a"].tolist() == [1, 3]
+    assert selected["b"].tolist() == [1, 9]
+    assert selected["pct_a"].tolist() == [0.25, 0.75]