Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(python): minor describe tidy-up, and slight rewording of some Exception docstrings #13942

Merged
merged 1 commit into from
Jan 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 31 additions & 27 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4459,60 +4459,64 @@ def describe(

@lru_cache
def skip_minmax(dt: PolarsDataType) -> bool:
return dt.is_nested() or dt in (Object, Null, Unknown, Categorical, Enum)
return dt.is_nested() or dt in (Categorical, Enum, Null, Object, Unknown)

# determine which columns get std/mean/percentile stats
# determine which columns will produce std/mean/percentile/etc
# statistics in a single pass over the frame schema
has_numeric_result, sort_cols = set(), set()
metric_exprs = []
metric_exprs: list[Expr] = []
null = F.lit(None)

for c, dt in self.schema.items():
is_numeric = dt.is_numeric()
is_temporal = not is_numeric and dt.is_temporal()
for c, dtype in self.schema.items():
is_numeric = dtype.is_numeric()
is_temporal = not is_numeric and dtype.is_temporal()

# counts
count_exprs = [
F.col(c).count().name.prefix("count:"),
F.col(c).null_count().name.prefix("null_count:"),
]
metric_exprs.extend(count_exprs)

# mean
if is_temporal:
mean_expr = F.col(c).to_physical().mean().cast(dt)
else:
mean_expr = F.col(c).mean() if is_numeric or dt == Boolean else null
metric_exprs.append(mean_expr.alias(f"mean:{c}"))
mean_expr = (
F.col(c).to_physical().mean().cast(dtype)
if is_temporal
else (F.col(c).mean() if is_numeric or dtype == Boolean else null)
)

# standard deviation
# standard deviation, min, max
expr_std = F.col(c).std() if is_numeric else null
metric_exprs.append(expr_std.alias(f"std:{c}"))

# min
min_expr = F.col(c).min() if not skip_minmax(dt) else null
metric_exprs.append(min_expr.alias(f"min:{c}"))
min_expr = F.col(c).min() if not skip_minmax(dtype) else null
max_expr = F.col(c).max() if not skip_minmax(dtype) else null

# percentiles
pct_exprs = []
for p in quantiles:
if is_numeric or is_temporal:
pct_expr = (
F.col(c).to_physical().quantile(p, interpolation).cast(dt)
F.col(c).to_physical().quantile(p, interpolation).cast(dtype)
if is_temporal
else F.col(c).quantile(p, interpolation)
)
sort_cols.add(c)
else:
pct_expr = null
metric_exprs.append(pct_expr.alias(f"{p}:{c}"))

# max
metric_exprs.append(
(F.col(c).max() if not skip_minmax(dt) else null).alias(f"max:{c}")
)
pct_exprs.append(pct_expr.alias(f"{p}:{c}"))

if is_numeric or dt.is_nested() or dt in (Null, Boolean):
if is_numeric or dtype.is_nested() or dtype in (Null, Boolean):
has_numeric_result.add(c)

# add column expressions (in end-state 'metrics' list order)
metric_exprs.extend(
[
*count_exprs,
mean_expr.alias(f"mean:{c}"),
expr_std.alias(f"std:{c}"),
min_expr.alias(f"min:{c}"),
*pct_exprs,
max_expr.alias(f"max:{c}"),
]
)

# if more than one quantile requested, sort relevant columns to make them O(1)
# TODO: remove once we have engine support for retrieving multiple quantiles
lf = (
Expand Down
18 changes: 9 additions & 9 deletions py-polars/polars/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,16 @@ class ColumnNotFoundError(PolarsError): # type: ignore[no-redef, misc]
"""Exception raised when a specified column is not found."""

class ComputeError(PolarsError): # type: ignore[no-redef, misc]
"""Exception raised when polars could not finish the computation."""
"""Exception raised when Polars could not perform an underlying computation."""

class DuplicateError(PolarsError): # type: ignore[no-redef, misc]
"""Exception raised when a column name is duplicated."""

class InvalidOperationError(PolarsError): # type: ignore[no-redef, misc]
"""Exception raised when an operation is not allowed on a certain data type."""
"""Exception raised when an operation is not allowed (or possible) against a given object or data structure.""" # noqa: W505

class NoDataError(PolarsError): # type: ignore[no-redef, misc]
"""Exception raised when an operation can not be performed on an empty data structure.""" # noqa: W505
"""Exception raised when an operation cannot be performed on an empty data structure.""" # noqa: W505

class OutOfBoundsError(PolarsError): # type: ignore[no-redef, misc]
"""Exception raised when the given index is out of bounds."""
Expand All @@ -44,19 +44,19 @@ class PolarsPanicError(PolarsError): # type: ignore[no-redef, misc]
"""Exception raised when an unexpected state causes a panic in the underlying Rust library.""" # noqa: W505

class SchemaError(PolarsError): # type: ignore[no-redef, misc]
"""Exception raised when trying to combine data structures with mismatched schemas.""" # noqa: W505
"""Exception raised when an unexpected schema mismatch causes an error."""

class SchemaFieldNotFoundError(PolarsError): # type: ignore[no-redef, misc]
"""Exception raised when a specified schema field is not found."""

class ShapeError(PolarsError): # type: ignore[no-redef, misc]
"""Exception raised when trying to combine data structures with incompatible shapes.""" # noqa: W505
"""Exception raised when trying to perform operations on data structures with incompatible shapes.""" # noqa: W505

class StringCacheMismatchError(PolarsError): # type: ignore[no-redef, misc]
"""Exception raised when string caches come from different sources."""

class StructFieldNotFoundError(PolarsError): # type: ignore[no-redef, misc]
"""Exception raised when a specified schema field is not found."""
"""Exception raised when a specified Struct field is not found."""

class PolarsWarning(Exception): # type: ignore[no-redef]
"""Base class for all Polars warnings."""
Expand All @@ -82,7 +82,7 @@ class TooManyRowsReturnedError(RowsError):


class ModuleUpgradeRequired(ModuleNotFoundError):
"""Exception raised when the module is installed but needs to be upgraded."""
"""Exception raised when a module is installed but needs to be upgraded."""


class ParameterCollisionError(PolarsError): # type: ignore[misc]
Expand All @@ -106,15 +106,15 @@ class ChronoFormatWarning(PolarsWarning): # type: ignore[misc]


class PolarsInefficientMapWarning(PolarsWarning): # type: ignore[misc]
"""Warning raised when a potentially slow `apply` operation is performed."""
"""Warning raised when a potentially slow `map_*` operation is performed."""


class TimeZoneAwareConstructorWarning(PolarsWarning): # type: ignore[misc]
"""Warning raised when constructing Series from non-UTC time-zone-aware inputs."""


class ArrowError(Exception):
"""deprecated will be removed."""
"""Deprecated: will be removed."""


__all__ = [
Expand Down