From db9a0bba500cdd0dcfe726383b4a76278b4095ec Mon Sep 17 00:00:00 2001 From: taki Date: Thu, 18 Jan 2024 23:11:02 +0100 Subject: [PATCH] perf: optimize `DataFrame.describe` by presorting columns By presorting numerical columns, quantiles/min/max will be O(1) --- py-polars/polars/dataframe/frame.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 1812b073bbe4..37e3f9671565 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -4419,7 +4419,9 @@ def describe( # Determine metrics and optional/additional percentiles metrics = ["count", "null_count", "mean", "std", "min"] percentile_exprs = [] - for p in parse_percentiles(percentiles): + + percentiles = parse_percentiles(percentiles) + for p in percentiles: for c in self.columns: expr = F.col(c).quantile(p) if c in stat_cols else F.lit(None) expr = expr.alias(f"{p}:{c}") @@ -4451,8 +4453,16 @@ def describe( for c in self.columns ] + # If more than one quantile is requested, + # sort numerical columns to make quantile lookups O(1). + # TODO: Should be removed once Polars supports + # getting multiple quantiles at once. + sort_exprs = [ + (F.col(c).sort() if len(percentiles) > 1 and c in stat_cols else F.col(c)) + for c in self.columns + ] # Calculate metrics in parallel - df_metrics = self.select( + df_metrics = self.select(*sort_exprs).select( F.all().count().name.prefix("count:"), F.all().null_count().name.prefix("null_count:"), *mean_exprs,