Skip to content

Commit

Permalink
optimise metrics creation, support additional temporal metrics, expose percentile interpolation
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed Jan 18, 2024
1 parent 493f2ae commit 6e540da
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 107 deletions.
219 changes: 126 additions & 93 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import warnings
from collections import OrderedDict, defaultdict
from collections.abc import Sized
from functools import lru_cache
from io import BytesIO, StringIO, TextIOWrapper
from operator import itemgetter
from pathlib import Path
Expand Down Expand Up @@ -4352,7 +4353,10 @@ def _parse_column(col_name: str, dtype: PolarsDataType) -> tuple[str, str, str]:
return None

def describe(
self, percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75)
self,
percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75),
*,
interpolation: RollingInterpolationMethod = "nearest",
) -> Self:
"""
Summary statistics for a DataFrame.
Expand All @@ -4363,130 +4367,159 @@ def describe(
One or more percentiles to include in the summary statistics.
All values must be in the range `[0, 1]`.
interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'}
Interpolation method used when calculating percentiles.
Notes
-----
The median is included by default as the 50% percentile.
Warnings
--------
We will never guarantee the output of describe to be stable.
It will show statistics that we deem informative and may
be updated in the future.
We do not guarantee the output of `describe` to be stable. It will show
statistics that we deem informative, and may be updated in the future.
See Also
--------
glimpse
Examples
--------
>>> from datetime import date
>>> from datetime import date, time
>>> df = pl.DataFrame(
... {
... "float": [1.0, 2.8, 3.0],
... "int": [4, 5, None],
... "int": [40, 50, None],
... "bool": [True, False, True],
... "str": [None, "b", "c"],
... "str2": ["usd", "eur", None],
... "str": ["zz", "xx", "yy"],
... "date": [date(2020, 1, 1), date(2021, 7, 5), date(2022, 12, 31)],
... "time": [time(10, 20, 30), time(14, 45, 50), time(23, 15, 10)],
... }
... )
Show default frame statistics:
>>> df.describe()
shape: (9, 7)
┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐
│ describe ┆ float ┆ int ┆ bool ┆ str ┆ str2 ┆ date │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │
╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡
│ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 2 ┆ 2 ┆ 3 │
│ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │
│ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │
│ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null ┆ null │
│ min ┆ 1.0 ┆ 4.0 ┆ null ┆ b ┆ eur ┆ 2020-01-01 │
│ 25% ┆ 2.8 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │
│ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ 2021-07-05 │
│ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │
│ max ┆ 3.0 ┆ 5.0 ┆ null ┆ c ┆ usd ┆ 2022-12-31 │
└────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘
┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐
│ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │
╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡
│ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │
│ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │
│ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │
│ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │
│ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │
│ 25% ┆ 2.8 ┆ 40.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │
│ 50% ┆ 2.8 ┆ 50.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │
│ 75% ┆ 3.0 ┆ 50.0 ┆ null ┆ null ┆ 2022-12-31 ┆ 23:15:10 │
│ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │
└────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘
Customize which percentiles are displayed, applying linear interpolation:
>>> df.describe(
... percentiles=[0.1, 0.3, 0.5, 0.7, 0.9],
... interpolation="linear",
... )
shape: (11, 7)
┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐
│ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │
╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡
│ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │
│ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │
│ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │
│ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │
│ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │
│ 10% ┆ 1.36 ┆ 41.0 ┆ null ┆ null ┆ 2020-04-20 ┆ 11:13:34 │
│ 30% ┆ 2.08 ┆ 43.0 ┆ null ┆ null ┆ 2020-11-26 ┆ 12:59:42 │
│ 50% ┆ 2.8 ┆ 45.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │
│ 70% ┆ 2.88 ┆ 47.0 ┆ null ┆ null ┆ 2022-02-07 ┆ 18:09:34 │
│ 90% ┆ 2.96 ┆ 49.0 ┆ null ┆ null ┆ 2022-09-13 ┆ 21:33:18 │
│ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │
└────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘
"""
if not self.columns:
msg = "cannot describe a DataFrame without any columns"
raise TypeError(msg)

# Determine which columns should get std/mean/percentile statistics
stat_cols, temporal_cols, bool_cols, numeric_result = set(), set(), set(), set()
schema = self.schema
for c, dt in schema.items():
if dt.is_numeric():
stat_cols.add(c)
numeric_result.add(c)
elif dt.is_temporal():
temporal_cols.add(c)
elif dt == Boolean:
bool_cols.add(c)
numeric_result.add(c)
elif dt == Null or dt.is_nested():
numeric_result.add(c)

# Determine metrics and optional/additional percentiles
# create list of metrics
metrics = ["count", "null_count", "mean", "std", "min"]
percentile_exprs = []
for p in parse_percentiles(percentiles):
for c in self.columns:
if c in stat_cols:
expr = F.col(c).quantile(p)
elif p == 0.5 and c in temporal_cols:
expr = F.col(c).to_physical().median().cast(schema[c])
else:
expr = F.lit(None)
expr = expr.alias(f"{p}:{c}")
percentile_exprs.append(expr)
metrics.append(f"{p*100:g}%")
if quantiles := parse_percentiles(percentiles):
metrics.extend(f"{q * 100:g}%" for q in quantiles)
metrics.append("max")

mean_exprs = [
(
F.col(c).mean() if (c in stat_cols or c in bool_cols) else F.lit(None)
).alias(f"mean:{c}")
for c in self.columns
]
std_exprs = [
(F.col(c).std() if c in stat_cols else F.lit(None)).alias(f"std:{c}")
for c in self.columns
]
minmax_cols = {
c
for c, dt in self.schema.items()
if not dt.is_nested()
and dt not in (Object, Null, Unknown, Categorical, Enum, Boolean)
}
min_exprs = [
(F.col(c).min() if c in minmax_cols else F.lit(None)).alias(f"min:{c}")
for c in self.columns
]
max_exprs = [
(F.col(c).max() if c in minmax_cols else F.lit(None)).alias(f"max:{c}")
for c in self.columns
]
@lru_cache
def skip_minmax(dt: PolarsDataType) -> bool:
return dt.is_nested() or dt in (Object, Null, Unknown, Categorical, Enum)

# Calculate metrics in parallel
df_metrics = self.select(
F.all().count().name.prefix("count:"),
F.all().null_count().name.prefix("null_count:"),
*mean_exprs,
*std_exprs,
*min_exprs,
*percentile_exprs,
*max_exprs,
)
# determine which columns get std/mean/percentile stats
numeric_result = set()
metric_exprs = []
null = F.lit(None)

for c, dt in self.schema.items():
is_numeric = dt.is_numeric()
is_temporal = not is_numeric and dt.is_temporal()

# counts
count_exprs = [
F.col(c).count().name.prefix("count:"),
F.col(c).null_count().name.prefix("null_count:"),
]
metric_exprs.extend(count_exprs)

# mean
if is_temporal:
mean_expr = F.col(c).to_physical().mean().cast(dt)
else:
mean_expr = F.col(c).mean() if is_numeric or dt == Boolean else null
metric_exprs.append(mean_expr.alias(f"mean:{c}"))

# standard deviation
expr_std = F.col(c).std() if is_numeric else null
metric_exprs.append(expr_std.alias(f"std:{c}"))

# min
min_expr = F.col(c).min() if not skip_minmax(dt) else null
metric_exprs.append(min_expr.alias(f"min:{c}"))

# percentiles
for p in quantiles:
pct_expr = (
(
F.col(c).to_physical().quantile(p, interpolation).cast(dt)
if is_temporal
else F.col(c).quantile(p, interpolation)
)
if (is_numeric or is_temporal)
else null
)
metric_exprs.append(pct_expr.alias(f"{p}:{c}"))

# max
metric_exprs.append(
(F.col(c).max() if not skip_minmax(dt) else null).alias(f"max:{c}")
)

if is_numeric or dt.is_nested() or dt in (Null, Boolean):
numeric_result.add(c)

# calculate metrics in parallel
df_metrics = self.select(*metric_exprs)

# Reshape wide result
described = [
df_metrics.row(0)[(n * self.width) : (n + 1) * self.width]
for n in range(len(metrics))
# reshape wide result
n_metrics = len(metrics)
column_metrics = [
df_metrics.row(0)[(n * n_metrics) : (n + 1) * n_metrics]
for n in range(self.width)
]
summary = dict(zip(self.columns, column_metrics))

# Cast by column type (numeric/bool -> float), (other -> string)
summary = dict(zip(self.columns, list(zip(*described))))
# cast by column type (numeric/bool -> float), (other -> string)
for c in self.columns:
summary[c] = [ # type: ignore[assignment]
None
Expand All @@ -4495,9 +4528,9 @@ def describe(
for v in summary[c]
]

# Return results as a DataFrame
# return results as a DataFrame
df_summary = self._from_dict(summary)
df_summary.insert_column(0, pl.Series("describe", metrics))
df_summary.insert_column(0, pl.Series("statistic", metrics))
return df_summary

def get_column_index(self, name: str) -> int:
Expand Down
6 changes: 5 additions & 1 deletion py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1818,7 +1818,9 @@ def to_frame(self, name: str | None = None) -> DataFrame:
return wrap_df(PyDataFrame([self._s]))

def describe(
self, percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75)
self,
percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75),
interpolation: RollingInterpolationMethod = "nearest",
) -> DataFrame:
"""
Quick summary statistics of a Series.
Expand All @@ -1831,6 +1833,8 @@ def describe(
percentiles
One or more percentiles to include in the summary statistics (if the
Series has a numeric dtype). All values must be in the range `[0, 1]`.
interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'}
Interpolation method used when calculating percentiles.
Notes
-----
Expand Down
Loading

0 comments on commit 6e540da

Please sign in to comment.