From c759caa6eb1188b453ac558be8055ca0d668bc06 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Wed, 24 Jan 2024 08:01:25 +0000 Subject: [PATCH] refactor(python): minor `describe` tidy-up, and minor rewording of some Exception docstrings --- py-polars/polars/dataframe/frame.py | 58 +++++++++++++++-------------- py-polars/polars/exceptions.py | 18 ++++----- 2 files changed, 40 insertions(+), 36 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index b682ff006019..f03de8b79b4f 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -4459,60 +4459,64 @@ def describe( @lru_cache def skip_minmax(dt: PolarsDataType) -> bool: - return dt.is_nested() or dt in (Object, Null, Unknown, Categorical, Enum) + return dt.is_nested() or dt in (Categorical, Enum, Null, Object, Unknown) - # determine which columns get std/mean/percentile stats + # determine which columns will produce std/mean/percentile/etc + # statistics in a single pass over the frame schema has_numeric_result, sort_cols = set(), set() - metric_exprs = [] + metric_exprs: list[Expr] = [] null = F.lit(None) - for c, dt in self.schema.items(): - is_numeric = dt.is_numeric() - is_temporal = not is_numeric and dt.is_temporal() + for c, dtype in self.schema.items(): + is_numeric = dtype.is_numeric() + is_temporal = not is_numeric and dtype.is_temporal() # counts count_exprs = [ F.col(c).count().name.prefix("count:"), F.col(c).null_count().name.prefix("null_count:"), ] - metric_exprs.extend(count_exprs) - # mean - if is_temporal: - mean_expr = F.col(c).to_physical().mean().cast(dt) - else: - mean_expr = F.col(c).mean() if is_numeric or dt == Boolean else null - metric_exprs.append(mean_expr.alias(f"mean:{c}")) + mean_expr = ( + F.col(c).to_physical().mean().cast(dtype) + if is_temporal + else (F.col(c).mean() if is_numeric or dtype == Boolean else null) + ) - # standard deviation + # standard deviation, min, max expr_std = F.col(c).std() if is_numeric else null - metric_exprs.append(expr_std.alias(f"std:{c}")) - - # min - min_expr = F.col(c).min() if not skip_minmax(dt) else null - metric_exprs.append(min_expr.alias(f"min:{c}")) + min_expr = F.col(c).min() if not skip_minmax(dtype) else null + max_expr = F.col(c).max() if not skip_minmax(dtype) else null # percentiles + pct_exprs = [] for p in quantiles: if is_numeric or is_temporal: pct_expr = ( - F.col(c).to_physical().quantile(p, interpolation).cast(dt) + F.col(c).to_physical().quantile(p, interpolation).cast(dtype) if is_temporal else F.col(c).quantile(p, interpolation) ) sort_cols.add(c) else: pct_expr = null - metric_exprs.append(pct_expr.alias(f"{p}:{c}")) - - # max - metric_exprs.append( - (F.col(c).max() if not skip_minmax(dt) else null).alias(f"max:{c}") - ) + pct_exprs.append(pct_expr.alias(f"{p}:{c}")) - if is_numeric or dt.is_nested() or dt in (Null, Boolean): + if is_numeric or dtype.is_nested() or dtype in (Null, Boolean): has_numeric_result.add(c) + # add column expressions (in end-state 'metrics' list order) + metric_exprs.extend( + [ + *count_exprs, + mean_expr.alias(f"mean:{c}"), + expr_std.alias(f"std:{c}"), + min_expr.alias(f"min:{c}"), + *pct_exprs, + max_expr.alias(f"max:{c}"), + ] + ) + # if more than one quantile requested, sort relevant columns to make them O(1) # TODO: remove once we have engine support for retrieving multiples quantiles lf = ( diff --git a/py-polars/polars/exceptions.py b/py-polars/polars/exceptions.py index 386f1a1758d0..608e1e07e6e5 100644 --- a/py-polars/polars/exceptions.py +++ b/py-polars/polars/exceptions.py @@ -26,16 +26,16 @@ class ColumnNotFoundError(PolarsError): # type: ignore[no-redef, misc] """Exception raised when a specified column is not found.""" class ComputeError(PolarsError): # type: ignore[no-redef, misc] - """Exception raised when polars could not finish the computation.""" + """Exception raised when Polars could not perform an underlying computation.""" class DuplicateError(PolarsError): # type: ignore[no-redef, misc] """Exception raised when a column name is duplicated.""" class InvalidOperationError(PolarsError): # type: ignore[no-redef, misc] - """Exception raised when an operation is not allowed on a certain data type.""" + """Exception raised when an operation is not allowed (or possible) against a given object or data structure.""" # noqa: W505 class NoDataError(PolarsError): # type: ignore[no-redef, misc] - """Exception raised when an operation can not be performed on an empty data structure.""" # noqa: W505 + """Exception raised when an operation cannot be performed on an empty data structure.""" # noqa: W505 class OutOfBoundsError(PolarsError): # type: ignore[no-redef, misc] """Exception raised when the given index is out of bounds.""" @@ -44,19 +44,19 @@ class PolarsPanicError(PolarsError): # type: ignore[no-redef, misc] """Exception raised when an unexpected state causes a panic in the underlying Rust library.""" # noqa: W505 class SchemaError(PolarsError): # type: ignore[no-redef, misc] - """Exception raised when trying to combine data structures with mismatched schemas.""" # noqa: W505 + """Exception raised when an unexpected schema mismatch causes an error.""" class SchemaFieldNotFoundError(PolarsError): # type: ignore[no-redef, misc] """Exception raised when a specified schema field is not found.""" class ShapeError(PolarsError): # type: ignore[no-redef, misc] - """Exception raised when trying to combine data structures with incompatible shapes.""" # noqa: W505 + """Exception raised when trying to perform operations on data structures with incompatible shapes.""" # noqa: W505 class StringCacheMismatchError(PolarsError): # type: ignore[no-redef, misc] """Exception raised when string caches come from different sources.""" class StructFieldNotFoundError(PolarsError): # type: ignore[no-redef, misc] - """Exception raised when a specified schema field is not found.""" + """Exception raised when a specified Struct field is not found.""" class PolarsWarning(Exception): # type: ignore[no-redef] """Base class for all Polars warnings.""" @@ -82,7 +82,7 @@ class TooManyRowsReturnedError(RowsError): class ModuleUpgradeRequired(ModuleNotFoundError): - """Exception raised when the module is installed but needs to be upgraded.""" + """Exception raised when a module is installed but needs to be upgraded.""" class ParameterCollisionError(PolarsError): # type: ignore[misc] @@ -106,7 +106,7 @@ class ChronoFormatWarning(PolarsWarning): # type: ignore[misc] class PolarsInefficientMapWarning(PolarsWarning): # type: ignore[misc] - """Warning raised when a potentially slow `apply` operation is performed.""" + """Warning raised when a potentially slow `map_*` operation is performed.""" class TimeZoneAwareConstructorWarning(PolarsWarning): # type: ignore[misc] @@ -114,7 +114,7 @@ class TimeZoneAwareConstructorWarning(PolarsWarning): # type: ignore[misc] class ArrowError(Exception): - """deprecated will be removed.""" + """Deprecated: will be removed.""" __all__ = [