Skip to content

Commit

Permalink
rebase/integrate quantile sorting
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed Jan 23, 2024
1 parent bbb8911 commit b1ae70e
Showing 1 changed file with 17 additions and 9 deletions.
26 changes: 17 additions & 9 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4462,7 +4462,7 @@ def skip_minmax(dt: PolarsDataType) -> bool:
return dt.is_nested() or dt in (Object, Null, Unknown, Categorical, Enum)

# determine which columns get std/mean/percentile stats
numeric_result = set()
has_numeric_result, sort_cols = set(), set()
metric_exprs = []
null = F.lit(None)

Expand Down Expand Up @@ -4494,15 +4494,15 @@ def skip_minmax(dt: PolarsDataType) -> bool:

# percentiles
for p in quantiles:
pct_expr = (
(
if is_numeric or is_temporal:
pct_expr = (
F.col(c).to_physical().quantile(p, interpolation).cast(dt)
if is_temporal
else F.col(c).quantile(p, interpolation)
)
if (is_numeric or is_temporal)
else null
)
sort_cols.add(c)
else:
pct_expr = null
metric_exprs.append(pct_expr.alias(f"{p}:{c}"))

# max
Expand All @@ -4511,10 +4511,18 @@ def skip_minmax(dt: PolarsDataType) -> bool:
)

if is_numeric or dt.is_nested() or dt in (Null, Boolean):
numeric_result.add(c)
has_numeric_result.add(c)

# if more than one quantile is requested, sort the relevant columns so that each quantile lookup is O(1)
# TODO: remove once we have engine support for retrieving multiple quantiles
lf = (
self.lazy().with_columns(F.col(c).sort() for c in sort_cols)
if sort_cols
else self.lazy()
)

# calculate metrics in parallel
df_metrics = self.select(*metric_exprs)
df_metrics = lf.select(*metric_exprs).collect()

# reshape wide result
n_metrics = len(metrics)
Expand All @@ -4529,7 +4537,7 @@ def skip_minmax(dt: PolarsDataType) -> bool:
summary[c] = [ # type: ignore[assignment]
None
if (v is None or isinstance(v, dict))
else (float(v) if (c in numeric_result) else str(v))
else (float(v) if (c in has_numeric_result) else str(v))
for v in summary[c]
]

Expand Down

0 comments on commit b1ae70e

Please sign in to comment.