From 6e540dabc957f4eb56fdd00b94a25a9eac5258ad Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Thu, 18 Jan 2024 13:02:56 +0000 Subject: [PATCH] optimise metrics creation, support additional temporal metrics, expose percentile interpolation --- py-polars/polars/dataframe/frame.py | 219 ++++++++++-------- py-polars/polars/series/series.py | 6 +- .../tests/unit/dataframe/test_describe.py | 38 +-- py-polars/tests/unit/series/test_describe.py | 4 + 4 files changed, 160 insertions(+), 107 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 78077071565a4..af9655d5e7bc2 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -7,6 +7,7 @@ import warnings from collections import OrderedDict, defaultdict from collections.abc import Sized +from functools import lru_cache from io import BytesIO, StringIO, TextIOWrapper from operator import itemgetter from pathlib import Path @@ -4352,7 +4353,10 @@ def _parse_column(col_name: str, dtype: PolarsDataType) -> tuple[str, str, str]: return None def describe( - self, percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75) + self, + percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75), + *, + interpolation: RollingInterpolationMethod = "nearest", ) -> Self: """ Summary statistics for a DataFrame. @@ -4363,15 +4367,17 @@ def describe( One or more percentiles to include in the summary statistics. All values must be in the range `[0, 1]`. + interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'} + Interpolation method used when calculating percentiles. + Notes ----- The median is included by default as the 50% percentile. Warnings -------- - We will never guarantee the output of describe to be stable. - It will show statistics that we deem informative and may - be updated in the future. + We do not guarantee the output of `describe` to be stable. It will show + statistics that we deem informative, and may be updated in the future. See Also -------- @@ -4379,114 +4385,141 @@ def describe( Examples -------- - >>> from datetime import date + >>> from datetime import date, time >>> df = pl.DataFrame( ... { ... "float": [1.0, 2.8, 3.0], - ... "int": [4, 5, None], + ... "int": [40, 50, None], ... "bool": [True, False, True], - ... "str": [None, "b", "c"], - ... "str2": ["usd", "eur", None], + ... "str": ["zz", "xx", "yy"], ... "date": [date(2020, 1, 1), date(2021, 7, 5), date(2022, 12, 31)], + ... "time": [time(10, 20, 30), time(14, 45, 50), time(23, 15, 10)], ... } ... ) + + Show default frame statistics: + >>> df.describe() shape: (9, 7) - ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ - │ describe ┆ float ┆ int ┆ bool ┆ str ┆ str2 ┆ date │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ - ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ - │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 2 ┆ 2 ┆ 3 │ - │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ - │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ - │ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null ┆ null │ - │ min ┆ 1.0 ┆ 4.0 ┆ null ┆ b ┆ eur ┆ 2020-01-01 │ - │ 25% ┆ 2.8 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ - │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ 2021-07-05 │ - │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ - │ max ┆ 3.0 ┆ 5.0 ┆ null ┆ c ┆ usd ┆ 2022-12-31 │ - └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + ┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐ + │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡ + │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │ + │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │ + │ 25% ┆ 2.8 ┆ 40.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ + │ 50% ┆ 2.8 ┆ 50.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ + │ 75% ┆ 3.0 ┆ 50.0 ┆ null ┆ null ┆ 2022-12-31 ┆ 23:15:10 │ + │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │ + └────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘ + + Customize which percentiles are displayed, applying linear interpolation: + + >>> df.describe( + ... percentiles=[0.1, 0.3, 0.5, 0.7, 0.9], + ... interpolation="linear", + ... ) + shape: (11, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐ + │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡ + │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │ + │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │ + │ 10% ┆ 1.36 ┆ 41.0 ┆ null ┆ null ┆ 2020-04-20 ┆ 11:13:34 │ + │ 30% ┆ 2.08 ┆ 43.0 ┆ null ┆ null ┆ 2020-11-26 ┆ 12:59:42 │ + │ 50% ┆ 2.8 ┆ 45.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ + │ 70% ┆ 2.88 ┆ 47.0 ┆ null ┆ null ┆ 2022-02-07 ┆ 18:09:34 │ + │ 90% ┆ 2.96 ┆ 49.0 ┆ null ┆ null ┆ 2022-09-13 ┆ 21:33:18 │ + │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │ + └────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘ """ if not self.columns: msg = "cannot describe a DataFrame without any columns" raise TypeError(msg) - # Determine which columns should get std/mean/percentile statistics - stat_cols, temporal_cols, bool_cols, numeric_result = set(), set(), set(), set() - schema = self.schema - for c, dt in schema.items(): - if dt.is_numeric(): - stat_cols.add(c) - numeric_result.add(c) - elif dt.is_temporal(): - temporal_cols.add(c) - elif dt == Boolean: - bool_cols.add(c) - numeric_result.add(c) - elif dt == Null or dt.is_nested(): - numeric_result.add(c) - - # Determine metrics and optional/additional percentiles + # create list of metrics metrics = ["count", "null_count", "mean", "std", "min"] - percentile_exprs = [] - for p in parse_percentiles(percentiles): - for c in self.columns: - if c in stat_cols: - expr = F.col(c).quantile(p) - elif p == 0.5 and c in temporal_cols: - expr = F.col(c).to_physical().median().cast(schema[c]) - else: - expr = F.lit(None) - expr = expr.alias(f"{p}:{c}") - percentile_exprs.append(expr) - metrics.append(f"{p*100:g}%") + if quantiles := parse_percentiles(percentiles): + metrics.extend(f"{q * 100:g}%" for q in quantiles) metrics.append("max") - mean_exprs = [ - ( - F.col(c).mean() if (c in stat_cols or c in bool_cols) else F.lit(None) - ).alias(f"mean:{c}") - for c in self.columns - ] - std_exprs = [ - (F.col(c).std() if c in stat_cols else F.lit(None)).alias(f"std:{c}") - for c in self.columns - ] - minmax_cols = { - c - for c, dt in self.schema.items() - if not dt.is_nested() - and dt not in (Object, Null, Unknown, Categorical, Enum, Boolean) - } - min_exprs = [ - (F.col(c).min() if c in minmax_cols else F.lit(None)).alias(f"min:{c}") - for c in self.columns - ] - max_exprs = [ - (F.col(c).max() if c in minmax_cols else F.lit(None)).alias(f"max:{c}") - for c in self.columns - ] + @lru_cache + def skip_minmax(dt: PolarsDataType) -> bool: + return dt.is_nested() or dt in (Object, Null, Unknown, Categorical, Enum) - # Calculate metrics in parallel - df_metrics = self.select( - F.all().count().name.prefix("count:"), - F.all().null_count().name.prefix("null_count:"), - *mean_exprs, - *std_exprs, - *min_exprs, - *percentile_exprs, - *max_exprs, - ) + # determine which columns get std/mean/percentile stats + numeric_result = set() + metric_exprs = [] + null = F.lit(None) + + for c, dt in self.schema.items(): + is_numeric = dt.is_numeric() + is_temporal = not is_numeric and dt.is_temporal() + + # counts + count_exprs = [ + F.col(c).count().name.prefix("count:"), + F.col(c).null_count().name.prefix("null_count:"), + ] + metric_exprs.extend(count_exprs) + + # mean + if is_temporal: + mean_expr = F.col(c).to_physical().mean().cast(dt) + else: + mean_expr = F.col(c).mean() if is_numeric or dt == Boolean else null + metric_exprs.append(mean_expr.alias(f"mean:{c}")) + + # standard deviation + expr_std = F.col(c).std() if is_numeric else null + metric_exprs.append(expr_std.alias(f"std:{c}")) + + # min + min_expr = F.col(c).min() if not skip_minmax(dt) else null + metric_exprs.append(min_expr.alias(f"min:{c}")) + + # percentiles + for p in quantiles: + pct_expr = ( + ( + F.col(c).to_physical().quantile(p, interpolation).cast(dt) + if is_temporal + else F.col(c).quantile(p, interpolation) + ) + if (is_numeric or is_temporal) + else null + ) + metric_exprs.append(pct_expr.alias(f"{p}:{c}")) + + # max + metric_exprs.append( + (F.col(c).max() if not skip_minmax(dt) else null).alias(f"max:{c}") + ) + + if is_numeric or dt.is_nested() or dt in (Null, Boolean): + numeric_result.add(c) + + # calculate metrics in parallel + df_metrics = self.select(*metric_exprs) - # Reshape wide result - described = [ - df_metrics.row(0)[(n * self.width) : (n + 1) * self.width] - for n in range(len(metrics)) + # reshape wide result + n_metrics = len(metrics) + column_metrics = [ + df_metrics.row(0)[(n * n_metrics) : (n + 1) * n_metrics] + for n in range(self.width) ] + summary = dict(zip(self.columns, column_metrics)) - # Cast by column type (numeric/bool -> float), (other -> string) - summary = dict(zip(self.columns, list(zip(*described)))) + # cast by column type (numeric/bool -> float), (other -> string) for c in self.columns: summary[c] = [ # type: ignore[assignment] None @@ -4495,9 +4528,9 @@ def describe( for v in summary[c] ] - # Return results as a DataFrame + # return results as a DataFrame df_summary = self._from_dict(summary) - df_summary.insert_column(0, pl.Series("describe", metrics)) + df_summary.insert_column(0, pl.Series("statistic", metrics)) return df_summary def get_column_index(self, name: str) -> int: diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index a8199a48ea69d..c27bbaab1a400 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -1818,7 +1818,9 @@ def to_frame(self, name: str | None = None) -> DataFrame: return wrap_df(PyDataFrame([self._s])) def describe( - self, percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75) + self, + percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75), + interpolation: RollingInterpolationMethod = "nearest", ) -> DataFrame: """ Quick summary statistics of a Series. @@ -1831,6 +1833,8 @@ def describe( percentiles One or more percentiles to include in the summary statistics (if the Series has a numeric dtype). All values must be in the range `[0, 1]`. + interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'} + Interpolation method used when calculating percentiles. Notes ----- diff --git a/py-polars/tests/unit/dataframe/test_describe.py b/py-polars/tests/unit/dataframe/test_describe.py index 8254e8df18ebd..bf1cf1d2bf3a2 100644 --- a/py-polars/tests/unit/dataframe/test_describe.py +++ b/py-polars/tests/unit/dataframe/test_describe.py @@ -29,7 +29,7 @@ def test_df_describe() -> None: result = df.describe() expected = pl.DataFrame( { - "describe": [ + "statistic": [ "count", "null_count", "mean", @@ -52,7 +52,7 @@ def test_df_describe() -> None: 3.0, ], "b": [2.0, 1.0, 4.5, 0.7071067811865476, 4.0, 4.0, 5.0, 5.0, 5.0], - "c": [3.0, 0.0, 2 / 3, None, None, None, None, None, None], + "c": [3.0, 0.0, 2 / 3, None, False, None, None, None, True], "d": ["2", "1", None, None, "b", None, None, None, "c"], "e": ["2", "1", None, None, None, None, None, None, None], "f": [ @@ -61,9 +61,9 @@ def test_df_describe() -> None: None, None, "2020-01-01 10:30:00", - None, "2021-07-05 15:00:00", - None, + "2021-07-05 15:00:00", + "2022-12-31 20:30:00", "2022-12-31 20:30:00", ], "g": [ @@ -72,12 +72,22 @@ def test_df_describe() -> None: None, None, "2020-01-01", - None, "2021-07-05", - None, + "2021-07-05", + "2022-12-31", "2022-12-31", ], - "h": ["3", "0", None, None, "10:30:00", None, "15:00:00", None, "20:30:00"], + "h": [ + "3", + "0", + None, + None, + "10:30:00", + "15:00:00", + "15:00:00", + "20:30:00", + "20:30:00", + ], } ) assert_frame_equal(result, expected) @@ -103,7 +113,7 @@ def test_df_describe_nested() -> None: ("75%", None, None), ("max", None, None), ], - schema=["describe"] + df.columns, + schema=["statistic"] + df.columns, schema_overrides={"struct": pl.Float64, "list": pl.Float64}, ) assert_frame_equal(result, expected) @@ -126,7 +136,7 @@ def test_df_describe_custom_percentiles() -> None: ("80%", 2.0), ("max", 2.0), ], - schema=["describe"] + df.columns, + schema=["statistic"] + df.columns, ) assert_frame_equal(result, expected) @@ -144,7 +154,7 @@ def test_df_describe_no_percentiles(pcts: list[float] | None) -> None: ("min", 1.0), ("max", 2.0), ], - schema=["describe"] + df.columns, + schema=["statistic"] + df.columns, ) assert_frame_equal(result, expected) @@ -164,7 +174,7 @@ def test_df_describe_empty_column() -> None: ("75%", None), ("max", None), ], - schema=["describe"] + df.columns, + schema=["statistic"] + df.columns, ) assert_frame_equal(result, expected) @@ -180,7 +190,7 @@ def test_df_describe_empty() -> None: def test_df_describe_quantile_precision() -> None: df = pl.DataFrame({"a": range(10)}) result = df.describe(percentiles=[0.99, 0.999, 0.9999]) - result_metrics = result.get_column("describe").to_list() + result_metrics = result.get_column("statistic").to_list() expected_metrics = ["99%", "99.9%", "99.99%"] for m in expected_metrics: assert m in result_metrics @@ -196,5 +206,7 @@ def test_df_describe_object() -> None: result = df.describe(percentiles=(0.05, 0.25, 0.5, 0.75, 0.95)) - expected = pl.DataFrame({"describe": ["count", "null_count"], "object": ["3", "0"]}) + expected = pl.DataFrame( + {"statistic": ["count", "null_count"], "object": ["3", "0"]} + ) assert_frame_equal(result.head(2), expected) diff --git a/py-polars/tests/unit/series/test_describe.py b/py-polars/tests/unit/series/test_describe.py index 00b0c376ee771..06730549089e0 100644 --- a/py-polars/tests/unit/series/test_describe.py +++ b/py-polars/tests/unit/series/test_describe.py @@ -64,6 +64,8 @@ def test_series_describe_boolean() -> None: "count": 4, "null_count": 1, "mean": 0.75, + "min": False, + "max": True, } expected = pl.DataFrame( data={"statistic": stats.keys(), "value": stats.values()}, @@ -80,7 +82,9 @@ def test_series_describe_date() -> None: "count": "3", "null_count": "0", "min": "2021-01-01", + "25%": "2021-01-02", "50%": "2021-01-02", + "75%": "2021-01-03", "max": "2021-01-03", } expected = pl.DataFrame({"statistic": stats.keys(), "value": stats.values()})