Skip to content

Commit

Permalink
feat(python): support calling describe on a LazyFrame (#13982)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed Jan 25, 2024
1 parent 0c70388 commit 06c0190
Show file tree
Hide file tree
Showing 3 changed files with 224 additions and 112 deletions.
111 changes: 6 additions & 105 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import random
from collections import OrderedDict, defaultdict
from collections.abc import Sized
from functools import lru_cache
from io import BytesIO, StringIO, TextIOWrapper
from operator import itemgetter
from pathlib import Path
Expand Down Expand Up @@ -39,13 +38,9 @@
INTEGER_DTYPES,
N_INFER_DEFAULT,
Boolean,
Categorical,
Enum,
Float64,
Null,
Object,
String,
Unknown,
py_type_to_dtype,
)
from polars.dependencies import (
Expand Down Expand Up @@ -114,7 +109,6 @@
is_int_sequence,
is_str_sequence,
normalize_filepath,
parse_percentiles,
parse_version,
range_to_slice,
scale_bytes,
Expand Down Expand Up @@ -4361,7 +4355,7 @@ def describe(
percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75),
*,
interpolation: RollingInterpolationMethod = "nearest",
) -> Self:
) -> DataFrame:
"""
Summary statistics for a DataFrame.
Expand All @@ -4382,6 +4376,8 @@ def describe(
--------
We do not guarantee the output of `describe` to be stable. It will show
statistics that we deem informative, and may be updated in the future.
Using `describe` programmatically (versus interactive exploration) is
not recommended for this reason.
See Also
--------
Expand Down Expand Up @@ -4447,108 +4443,13 @@ def describe(
└────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘
"""
if not self.columns:
msg = "cannot describe a DataFrame without any columns"
msg = "cannot describe a DataFrame that has no columns"
raise TypeError(msg)

# create list of metrics
metrics = ["count", "null_count", "mean", "std", "min"]
if quantiles := parse_percentiles(percentiles):
metrics.extend(f"{q * 100:g}%" for q in quantiles)
metrics.append("max")

@lru_cache
def skip_minmax(dt: PolarsDataType) -> bool:
return dt.is_nested() or dt in (Categorical, Enum, Null, Object, Unknown)

# determine which columns will produce std/mean/percentile/etc
# statistics in a single pass over the frame schema
has_numeric_result, sort_cols = set(), set()
metric_exprs: list[Expr] = []
null = F.lit(None)

for c, dtype in self.schema.items():
is_numeric = dtype.is_numeric()
is_temporal = not is_numeric and dtype.is_temporal()

# counts
count_exprs = [
F.col(c).count().name.prefix("count:"),
F.col(c).null_count().name.prefix("null_count:"),
]
# mean
mean_expr = (
F.col(c).to_physical().mean().cast(dtype)
if is_temporal
else (F.col(c).mean() if is_numeric or dtype == Boolean else null)
)

# standard deviation, min, max
expr_std = F.col(c).std() if is_numeric else null
min_expr = F.col(c).min() if not skip_minmax(dtype) else null
max_expr = F.col(c).max() if not skip_minmax(dtype) else null

# percentiles
pct_exprs = []
for p in quantiles:
if is_numeric or is_temporal:
pct_expr = (
F.col(c).to_physical().quantile(p, interpolation).cast(dtype)
if is_temporal
else F.col(c).quantile(p, interpolation)
)
sort_cols.add(c)
else:
pct_expr = null
pct_exprs.append(pct_expr.alias(f"{p}:{c}"))

if is_numeric or dtype.is_nested() or dtype in (Null, Boolean):
has_numeric_result.add(c)

# add column expressions (in end-state 'metrics' list order)
metric_exprs.extend(
[
*count_exprs,
mean_expr.alias(f"mean:{c}"),
expr_std.alias(f"std:{c}"),
min_expr.alias(f"min:{c}"),
*pct_exprs,
max_expr.alias(f"max:{c}"),
]
)

# if more than one quantile requested, sort relevant columns to make them O(1)
# TODO: remove once we have engine support for retrieving multiples quantiles
lf = (
self.lazy().with_columns(F.col(c).sort() for c in sort_cols)
if sort_cols
else self.lazy()
return self.lazy().describe(
percentiles=percentiles, interpolation=interpolation
)

# calculate metrics in parallel
df_metrics = lf.select(*metric_exprs).collect()

# reshape wide result
n_metrics = len(metrics)
column_metrics = [
df_metrics.row(0)[(n * n_metrics) : (n + 1) * n_metrics]
for n in range(self.width)
]
summary = dict(zip(self.columns, column_metrics))

# cast by column type (numeric/bool -> float), (other -> string)
for c in self.columns:
summary[c] = [ # type: ignore[assignment]
None
if (v is None or isinstance(v, dict))
else (float(v) if (c in has_numeric_result) else str(v))
for v in summary[c]
]

# return results as a DataFrame
df_summary = self._from_dict(summary)
df_summary.insert_column(0, pl.Series("statistic", metrics))
return df_summary

def get_column_index(self, name: str) -> int:
"""
Find the index of a column by name.
Expand Down
Loading

0 comments on commit 06c0190

Please sign in to comment.