Skip to content

Commit

Permalink
perf: optimize DataFrame.describe by presorting columns
Browse files Browse the repository at this point in the history
By presorting numerical columns, computing quantiles/min/max becomes O(1).
  • Loading branch information
taki committed Jan 19, 2024
1 parent 36d0e94 commit db9a0bb
Showing 1 changed file with 12 additions and 2 deletions.
14 changes: 12 additions & 2 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4419,7 +4419,9 @@ def describe(
# Determine metrics and optional/additional percentiles
metrics = ["count", "null_count", "mean", "std", "min"]
percentile_exprs = []
for p in parse_percentiles(percentiles):

percentiles = parse_percentiles(percentiles)
for p in percentiles:
for c in self.columns:
expr = F.col(c).quantile(p) if c in stat_cols else F.lit(None)
expr = expr.alias(f"{p}:{c}")
Expand Down Expand Up @@ -4451,8 +4453,16 @@ def describe(
for c in self.columns
]

# If more than one quantile is requested,
# sort numerical columns so that each quantile lookup is O(1).
# TODO: Should be removed once Polars supports
# getting multiple quantiles at once.
sort_exprs = [
(F.col(c).sort() if len(percentiles) > 1 and c in stat_cols else F.col(c))
for c in self.columns
]
# Calculate metrics in parallel
df_metrics = self.select(
df_metrics = self.select(*sort_exprs).select(
F.all().count().name.prefix("count:"),
F.all().null_count().name.prefix("null_count:"),
*mean_exprs,
Expand Down

0 comments on commit db9a0bb

Please sign in to comment.