Skip to content

Commit

Permalink
optimise metrics creation, support additional temporal metrics, expose percentile interpolation
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed Jan 18, 2024
1 parent 493f2ae commit 6e540da
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 107 deletions.
219 changes: 126 additions & 93 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import warnings
from collections import OrderedDict, defaultdict
from collections.abc import Sized
from functools import lru_cache
from io import BytesIO, StringIO, TextIOWrapper
from operator import itemgetter
from pathlib import Path
Expand Down Expand Up @@ -4352,7 +4353,10 @@ def _parse_column(col_name: str, dtype: PolarsDataType) -> tuple[str, str, str]:
return None

def describe(
self, percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75)
self,
percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75),
*,
interpolation: RollingInterpolationMethod = "nearest",
) -> Self:
"""
Summary statistics for a DataFrame.
Expand All @@ -4363,130 +4367,159 @@ def describe(
One or more percentiles to include in the summary statistics.
All values must be in the range `[0, 1]`.
interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'}
Interpolation method used when calculating percentiles.
Notes
-----
The median is included by default as the 50% percentile.
Warnings
--------
We will never guarantee the output of describe to be stable.
It will show statistics that we deem informative and may
be updated in the future.
We do not guarantee the output of `describe` to be stable. It will show
statistics that we deem informative, and may be updated in the future.
See Also
--------
glimpse
Examples
--------
>>> from datetime import date
>>> from datetime import date, time
>>> df = pl.DataFrame(
... {
... "float": [1.0, 2.8, 3.0],
... "int": [4, 5, None],
... "int": [40, 50, None],
... "bool": [True, False, True],
... "str": [None, "b", "c"],
... "str2": ["usd", "eur", None],
... "str": ["zz", "xx", "yy"],
... "date": [date(2020, 1, 1), date(2021, 7, 5), date(2022, 12, 31)],
... "time": [time(10, 20, 30), time(14, 45, 50), time(23, 15, 10)],
... }
... )
Show default frame statistics:
>>> df.describe()
shape: (9, 7)
┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐
│ describe ┆ float ┆ int ┆ bool ┆ str ┆ str2 ┆ date │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │
╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡
│ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 2 ┆ 2 ┆ 3 │
│ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │
│ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │
│ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null ┆ null │
│ min ┆ 1.0 ┆ 4.0 ┆ null ┆ b ┆ eur ┆ 2020-01-01 │
│ 25% ┆ 2.8 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │
│ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ 2021-07-05 │
│ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │
│ max ┆ 3.0 ┆ 5.0 ┆ null ┆ c ┆ usd ┆ 2022-12-31 │
└────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘
┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐
│ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │
╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡
│ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │
│ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │
│ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │
│ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │
│ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │
│ 25% ┆ 2.8 ┆ 40.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │
│ 50% ┆ 2.8 ┆ 50.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │
│ 75% ┆ 3.0 ┆ 50.0 ┆ null ┆ null ┆ 2022-12-31 ┆ 23:15:10 │
│ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │
└────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘
Customize which percentiles are displayed, applying linear interpolation:
>>> df.describe(
... percentiles=[0.1, 0.3, 0.5, 0.7, 0.9],
... interpolation="linear",
... )
shape: (11, 7)
┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐
│ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │
╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡
│ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │
│ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │
│ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │
│ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │
│ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │
│ 10% ┆ 1.36 ┆ 41.0 ┆ null ┆ null ┆ 2020-04-20 ┆ 11:13:34 │
│ 30% ┆ 2.08 ┆ 43.0 ┆ null ┆ null ┆ 2020-11-26 ┆ 12:59:42 │
│ 50% ┆ 2.8 ┆ 45.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │
│ 70% ┆ 2.88 ┆ 47.0 ┆ null ┆ null ┆ 2022-02-07 ┆ 18:09:34 │
│ 90% ┆ 2.96 ┆ 49.0 ┆ null ┆ null ┆ 2022-09-13 ┆ 21:33:18 │
│ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │
└────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘
"""
if not self.columns:
msg = "cannot describe a DataFrame without any columns"
raise TypeError(msg)

# Determine which columns should get std/mean/percentile statistics
stat_cols, temporal_cols, bool_cols, numeric_result = set(), set(), set(), set()
schema = self.schema
for c, dt in schema.items():
if dt.is_numeric():
stat_cols.add(c)
numeric_result.add(c)
elif dt.is_temporal():
temporal_cols.add(c)
elif dt == Boolean:
bool_cols.add(c)
numeric_result.add(c)
elif dt == Null or dt.is_nested():
numeric_result.add(c)

# Determine metrics and optional/additional percentiles
# create list of metrics
metrics = ["count", "null_count", "mean", "std", "min"]
percentile_exprs = []
for p in parse_percentiles(percentiles):
for c in self.columns:
if c in stat_cols:
expr = F.col(c).quantile(p)
elif p == 0.5 and c in temporal_cols:
expr = F.col(c).to_physical().median().cast(schema[c])
else:
expr = F.lit(None)
expr = expr.alias(f"{p}:{c}")
percentile_exprs.append(expr)
metrics.append(f"{p*100:g}%")
if quantiles := parse_percentiles(percentiles):
metrics.extend(f"{q * 100:g}%" for q in quantiles)
metrics.append("max")

mean_exprs = [
(
F.col(c).mean() if (c in stat_cols or c in bool_cols) else F.lit(None)
).alias(f"mean:{c}")
for c in self.columns
]
std_exprs = [
(F.col(c).std() if c in stat_cols else F.lit(None)).alias(f"std:{c}")
for c in self.columns
]
minmax_cols = {
c
for c, dt in self.schema.items()
if not dt.is_nested()
and dt not in (Object, Null, Unknown, Categorical, Enum, Boolean)
}
min_exprs = [
(F.col(c).min() if c in minmax_cols else F.lit(None)).alias(f"min:{c}")
for c in self.columns
]
max_exprs = [
(F.col(c).max() if c in minmax_cols else F.lit(None)).alias(f"max:{c}")
for c in self.columns
]
@lru_cache
def skip_minmax(dt: PolarsDataType) -> bool:
return dt.is_nested() or dt in (Object, Null, Unknown, Categorical, Enum)

# Calculate metrics in parallel
df_metrics = self.select(
F.all().count().name.prefix("count:"),
F.all().null_count().name.prefix("null_count:"),
*mean_exprs,
*std_exprs,
*min_exprs,
*percentile_exprs,
*max_exprs,
)
# determine which columns get std/mean/percentile stats
numeric_result = set()
metric_exprs = []
null = F.lit(None)

for c, dt in self.schema.items():
is_numeric = dt.is_numeric()
is_temporal = not is_numeric and dt.is_temporal()

# counts
count_exprs = [
F.col(c).count().name.prefix("count:"),
F.col(c).null_count().name.prefix("null_count:"),
]
metric_exprs.extend(count_exprs)

# mean
if is_temporal:
mean_expr = F.col(c).to_physical().mean().cast(dt)
else:
mean_expr = F.col(c).mean() if is_numeric or dt == Boolean else null
metric_exprs.append(mean_expr.alias(f"mean:{c}"))

# standard deviation
expr_std = F.col(c).std() if is_numeric else null
metric_exprs.append(expr_std.alias(f"std:{c}"))

# min
min_expr = F.col(c).min() if not skip_minmax(dt) else null
metric_exprs.append(min_expr.alias(f"min:{c}"))

# percentiles
for p in quantiles:
pct_expr = (
(
F.col(c).to_physical().quantile(p, interpolation).cast(dt)
if is_temporal
else F.col(c).quantile(p, interpolation)
)
if (is_numeric or is_temporal)
else null
)
metric_exprs.append(pct_expr.alias(f"{p}:{c}"))

# max
metric_exprs.append(
(F.col(c).max() if not skip_minmax(dt) else null).alias(f"max:{c}")
)

if is_numeric or dt.is_nested() or dt in (Null, Boolean):
numeric_result.add(c)

# calculate metrics in parallel
df_metrics = self.select(*metric_exprs)

# Reshape wide result
described = [
df_metrics.row(0)[(n * self.width) : (n + 1) * self.width]
for n in range(len(metrics))
# reshape wide result
n_metrics = len(metrics)
column_metrics = [
df_metrics.row(0)[(n * n_metrics) : (n + 1) * n_metrics]
for n in range(self.width)
]
summary = dict(zip(self.columns, column_metrics))

# Cast by column type (numeric/bool -> float), (other -> string)
summary = dict(zip(self.columns, list(zip(*described))))
# cast by column type (numeric/bool -> float), (other -> string)
for c in self.columns:
summary[c] = [ # type: ignore[assignment]
None
Expand All @@ -4495,9 +4528,9 @@ def describe(
for v in summary[c]
]

# Return results as a DataFrame
# return results as a DataFrame
df_summary = self._from_dict(summary)
df_summary.insert_column(0, pl.Series("describe", metrics))
df_summary.insert_column(0, pl.Series("statistic", metrics))
return df_summary

def get_column_index(self, name: str) -> int:
Expand Down
6 changes: 5 additions & 1 deletion py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1818,7 +1818,9 @@ def to_frame(self, name: str | None = None) -> DataFrame:
return wrap_df(PyDataFrame([self._s]))

def describe(
self, percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75)
self,
percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75),
interpolation: RollingInterpolationMethod = "nearest",
) -> DataFrame:
"""
Quick summary statistics of a Series.
Expand All @@ -1831,6 +1833,8 @@ def describe(
percentiles
One or more percentiles to include in the summary statistics (if the
Series has a numeric dtype). All values must be in the range `[0, 1]`.
interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'}
Interpolation method used when calculating percentiles.
Notes
-----
Expand Down
Loading

0 comments on commit 6e540da

Please sign in to comment.