Skip to content

Commit

Permalink
feat(python): Add PerformanceWarning to LazyFrame properties (pola-…
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego authored and Wouittone committed Jun 22, 2024
1 parent f7f76c0 commit 020c075
Show file tree
Hide file tree
Showing 12 changed files with 151 additions and 47 deletions.
5 changes: 2 additions & 3 deletions py-polars/polars/_utils/deprecation.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
from __future__ import annotations

import inspect
import warnings
from functools import wraps
from typing import TYPE_CHECKING, Callable, Sequence, TypeVar

from polars._utils.various import find_stacklevel
from polars._utils.various import issue_warning

if TYPE_CHECKING:
import sys
Expand Down Expand Up @@ -41,7 +40,7 @@ def issue_deprecation_warning(message: str, *, version: str) -> None:
This argument is used to help developers determine when to remove the
deprecated functionality.
"""
warnings.warn(message, DeprecationWarning, stacklevel=find_stacklevel())
issue_warning(message, DeprecationWarning)


def deprecate_function(
Expand Down
5 changes: 2 additions & 3 deletions py-polars/polars/_utils/unstable.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,10 @@

import inspect
import os
import warnings
from functools import wraps
from typing import TYPE_CHECKING, Callable, TypeVar

from polars._utils.various import find_stacklevel
from polars._utils.various import issue_warning
from polars.exceptions import UnstableWarning

if TYPE_CHECKING:
Expand Down Expand Up @@ -46,7 +45,7 @@ def issue_unstable_warning(message: str | None = None) -> None:
" It may be changed at any point without it being considered a breaking change."
)

warnings.warn(message, UnstableWarning, stacklevel=find_stacklevel())
issue_warning(message, UnstableWarning)


def unstable() -> Callable[[Callable[P, T]], Callable[P, T]]:
Expand Down
19 changes: 19 additions & 0 deletions py-polars/polars/_utils/various.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,25 @@ def find_stacklevel() -> int:
return n


def issue_warning(message: str, category: type[Warning], **kwargs: Any) -> None:
"""
Issue a warning.
Parameters
----------
message
The message associated with the warning.
category
The warning category.
**kwargs
Additional arguments for `warnings.warn`. Note that the `stacklevel` is
determined automatically.
"""
warnings.warn(
message=message, category=category, stacklevel=find_stacklevel(), **kwargs
)


def _get_stack_locals(
of_type: type | Collection[type] | Callable[[Any], bool] | None = None,
*,
Expand Down
34 changes: 22 additions & 12 deletions py-polars/polars/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
MapWithoutReturnDtypeWarning,
NoDataError,
OutOfBoundsError,
PerformanceWarning,
PolarsError,
PolarsPanicError,
PolarsWarning,
Expand Down Expand Up @@ -94,11 +95,14 @@ class StructFieldNotFoundError(PolarsError): # type: ignore[no-redef, misc]
class PolarsWarning(Exception): # type: ignore[no-redef]
"""Base class for all Polars warnings."""

class CategoricalRemappingWarning(PolarsWarning): # type: ignore[no-redef, misc]
"""Warning raised when a categorical needs to be remapped to be compatible with another categorical.""" # noqa: W505
class PerformanceWarning(PolarsWarning): # type: ignore[no-redef, misc]
"""Warning issued to indicate potential performance pitfalls."""

class CategoricalRemappingWarning(PerformanceWarning): # type: ignore[no-redef, misc]
"""Warning issued when a categorical needs to be remapped to be compatible with another categorical.""" # noqa: W505

class MapWithoutReturnDtypeWarning(PolarsWarning): # type: ignore[no-redef, misc]
"""Warning raised when `map_elements` is performed without specifying the return dtype.""" # noqa: W505
"""Warning issued when `map_elements` is performed without specifying the return dtype.""" # noqa: W505


class InvalidAssert(PolarsError): # type: ignore[misc]
Expand Down Expand Up @@ -141,34 +145,31 @@ class ChronoFormatWarning(PolarsWarning): # type: ignore[misc]
"""


class PolarsInefficientMapWarning(PolarsWarning): # type: ignore[misc]
class CustomUFuncWarning(PolarsWarning): # type: ignore[misc]
"""Warning issued when a custom ufunc is handled differently than numpy ufunc would.""" # noqa: W505


class PolarsInefficientMapWarning(PerformanceWarning): # type: ignore[misc]
"""Warning issued when a potentially slow `map_*` operation is performed."""


class UnstableWarning(PolarsWarning): # type: ignore[misc]
"""Warning issued when unstable functionality is used."""


class CustomUFuncWarning(PolarsWarning): # type: ignore[misc]
"""Warning issued when a custom ufunc is handled differently than numpy ufunc would.""" # noqa: W505


__all__ = [
"CategoricalRemappingWarning",
# Errors
"ChronoFormatWarning",
"ColumnNotFoundError",
"ComputeError",
"DuplicateError",
"InvalidOperationError",
"MapWithoutReturnDtypeWarning",
"ModuleUpgradeRequired",
"NoDataError",
"NoRowsReturnedError",
"OutOfBoundsError",
"PolarsError",
"PolarsInefficientMapWarning",
"PolarsPanicError",
"PolarsWarning",
"RowsError",
"SQLInterfaceError",
"SQLSyntaxError",
Expand All @@ -178,4 +179,13 @@ class CustomUFuncWarning(PolarsWarning): # type: ignore[misc]
"StringCacheMismatchError",
"StructFieldNotFoundError",
"TooManyRowsReturnedError",
# Warnings
"PolarsWarning",
"CategoricalRemappingWarning",
"ChronoFormatWarning",
"CustomUFuncWarning",
"MapWithoutReturnDtypeWarning",
"PerformanceWarning",
"PolarsInefficientMapWarning",
"UnstableWarning",
]
55 changes: 44 additions & 11 deletions py-polars/polars/lazyframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
extend_bool,
is_bool_sequence,
is_sequence,
issue_warning,
normalize_filepath,
parse_percentiles,
)
Expand Down Expand Up @@ -74,6 +75,7 @@
py_type_to_dtype,
)
from polars.dependencies import import_optional, subprocess
from polars.exceptions import PerformanceWarning
from polars.lazyframe.group_by import LazyGroupBy
from polars.lazyframe.in_process import InProcessQuery
from polars.schema import Schema
Expand Down Expand Up @@ -399,13 +401,15 @@ def columns(self) -> list[str]:
Warnings
--------
Determining the column names of a LazyFrame requires resolving its schema.
Resolving the schema of a LazyFrame can be an expensive operation.
Avoid accessing this property repeatedly if possible.
Determining the column names of a LazyFrame requires resolving its schema,
which is a potentially expensive operation.
Using :meth:`collect_schema` is the idiomatic way of resolving the schema.
This property exists only for symmetry with the DataFrame class.
See Also
--------
collect_schema
Schema.names
Examples
--------
Expand All @@ -419,6 +423,12 @@ def columns(self) -> list[str]:
>>> lf.columns
['foo', 'bar']
"""
issue_warning(
"Determining the column names of a LazyFrame requires resolving its schema,"
" which is a potentially expensive operation. Use `LazyFrame.collect_schema().names()`"
" to get the column names without this warning.",
category=PerformanceWarning,
)
return self.collect_schema().names()

@property
Expand All @@ -433,13 +443,15 @@ def dtypes(self) -> list[DataType]:
Warnings
--------
Determining the data types of a LazyFrame requires resolving its schema.
Resolving the schema of a LazyFrame can be an expensive operation.
Avoid accessing this property repeatedly if possible.
Determining the data types of a LazyFrame requires resolving its schema,
which is a potentially expensive operation.
Using :meth:`collect_schema` is the idiomatic way to resolve the schema.
This property exists only for symmetry with the DataFrame class.
See Also
--------
collect_schema
Schema.dtypes
Examples
--------
Expand All @@ -453,6 +465,12 @@ def dtypes(self) -> list[DataType]:
>>> lf.dtypes
[Int64, Float64, String]
"""
issue_warning(
"Determining the data types of a LazyFrame requires resolving its schema,"
" which is a potentially expensive operation. Use `LazyFrame.collect_schema().dtypes()`"
" to get the data types without this warning.",
category=PerformanceWarning,
)
return self.collect_schema().dtypes()

@property
Expand All @@ -462,12 +480,14 @@ def schema(self) -> Schema:
Warnings
--------
Resolving the schema of a LazyFrame can be an expensive operation.
Avoid accessing this property repeatedly if possible.
Resolving the schema of a LazyFrame is a potentially expensive operation.
Using :meth:`collect_schema` is the idiomatic way to resolve the schema.
This property exists only for symmetry with the DataFrame class.
See Also
--------
collect_schema
Schema
Examples
--------
Expand All @@ -481,6 +501,11 @@ def schema(self) -> Schema:
>>> lf.schema
Schema({'foo': Int64, 'bar': Float64, 'ham': String})
"""
issue_warning(
"Resolving the schema of a LazyFrame is a potentially expensive operation."
" Use `LazyFrame.collect_schema()` to get the schema without this warning.",
category=PerformanceWarning,
)
return self.collect_schema()

@property
Expand All @@ -494,13 +519,15 @@ def width(self) -> int:
Warnings
--------
Determining the width of a LazyFrame requires resolving its schema.
Resolving the schema of a LazyFrame can be an expensive operation.
Avoid accessing this property repeatedly if possible.
Determining the width of a LazyFrame requires resolving its schema,
which is a potentially expensive operation.
Using :meth:`collect_schema` is the idiomatic way to resolve the schema.
This property exists only for symmetry with the DataFrame class.
See Also
--------
collect_schema
Schema.len
Examples
--------
Expand All @@ -513,6 +540,12 @@ def width(self) -> int:
>>> lf.width
2
"""
issue_warning(
"Determining the width of a LazyFrame requires resolving its schema,"
" which is a potentially expensive operation. Use `LazyFrame.collect_schema().len()`"
" to get the width without this warning.",
category=PerformanceWarning,
)
return self.collect_schema().len()

def __bool__(self) -> NoReturn:
Expand Down
3 changes: 2 additions & 1 deletion py-polars/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,11 @@ create_exception!(polars.exceptions, StringCacheMismatchError, PolarsBaseError);
create_exception!(polars.exceptions, StructFieldNotFoundError, PolarsBaseError);

create_exception!(polars.exceptions, PolarsBaseWarning, PyWarning);
create_exception!(polars.exceptions, PerformanceWarning, PolarsBaseWarning);
create_exception!(
polars.exceptions,
CategoricalRemappingWarning,
PolarsBaseWarning
PerformanceWarning
);
create_exception!(
polars.exceptions,
Expand Down
9 changes: 7 additions & 2 deletions py-polars/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ use crate::dataframe::PyDataFrame;
use crate::error::{
CategoricalRemappingWarning, ColumnNotFoundError, ComputeError, DuplicateError,
InvalidOperationError, MapWithoutReturnDtypeWarning, NoDataError, OutOfBoundsError,
PolarsBaseError, PolarsBaseWarning, PyPolarsErr, SQLInterfaceError, SQLSyntaxError,
SchemaError, SchemaFieldNotFoundError, StructFieldNotFoundError,
PerformanceWarning, PolarsBaseError, PolarsBaseWarning, PyPolarsErr, SQLInterfaceError,
SQLSyntaxError, SchemaError, SchemaFieldNotFoundError, StructFieldNotFoundError,
};
use crate::expr::PyExpr;
use crate::functions::PyStringCacheHolder;
Expand Down Expand Up @@ -366,6 +366,11 @@ fn polars(py: Python, m: &Bound<PyModule>) -> PyResult<()> {
// Exceptions - Warnings
m.add("PolarsWarning", py.get_type_bound::<PolarsBaseWarning>())
.unwrap();
m.add(
"PerformanceWarning",
py.get_type_bound::<PerformanceWarning>(),
)
.unwrap();
m.add(
"CategoricalRemappingWarning",
py.get_type_bound::<CategoricalRemappingWarning>(),
Expand Down
6 changes: 3 additions & 3 deletions py-polars/tests/unit/io/cloud/test_aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,12 @@ def test_read_s3(s3: str, function: Callable[..., Any], extension: str) -> None:
[(pl.scan_ipc, "ipc"), (pl.scan_parquet, "parquet")],
)
def test_scan_s3(s3: str, function: Callable[..., Any], extension: str) -> None:
df = function(
lf = function(
f"s3://bucket/foods1.{extension}",
storage_options={"endpoint_url": s3},
)
assert df.columns == ["category", "calories", "fats_g", "sugars_g"]
assert df.collect().shape == (27, 4)
assert lf.collect_schema().names() == ["category", "calories", "fats_g", "sugars_g"]
assert lf.collect().shape == (27, 4)


def test_lazy_count_s3(s3: str) -> None:
Expand Down
14 changes: 9 additions & 5 deletions py-polars/tests/unit/lazyframe/test_lazyframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import polars.selectors as cs
from polars import lit, when
from polars.datatypes import FLOAT_DTYPES
from polars.exceptions import PolarsInefficientMapWarning
from polars.exceptions import PerformanceWarning, PolarsInefficientMapWarning
from polars.testing import assert_frame_equal, assert_series_equal

if TYPE_CHECKING:
Expand Down Expand Up @@ -1382,7 +1382,11 @@ def test_lf_properties() -> None:
"ham": ["a", "b", "c"],
}
)
assert lf.schema == {"foo": pl.Int64, "bar": pl.Float64, "ham": pl.String}
assert lf.columns == ["foo", "bar", "ham"]
assert lf.dtypes == [pl.Int64, pl.Float64, pl.String]
assert lf.width == 3
with pytest.warns(PerformanceWarning):
assert lf.schema == {"foo": pl.Int64, "bar": pl.Float64, "ham": pl.String}
with pytest.warns(PerformanceWarning):
assert lf.columns == ["foo", "bar", "ham"]
with pytest.warns(PerformanceWarning):
assert lf.dtypes == [pl.Int64, pl.Float64, pl.String]
with pytest.warns(PerformanceWarning):
assert lf.width == 3
4 changes: 2 additions & 2 deletions py-polars/tests/unit/operations/arithmetic/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -670,7 +670,7 @@ def test_arithmetic_duration_div_multiply() -> None:
e=pl.col("a") * 2.5,
f=pl.col("a") / pl.col("a"), # a constant float
)
assert q.schema == pl.Schema(
assert q.collect_schema() == pl.Schema(
[
("a", pl.Duration(time_unit="us")),
("b", pl.Duration(time_unit="us")),
Expand Down Expand Up @@ -715,7 +715,7 @@ def test_arithmetic_duration_div_multiply() -> None:
b=2 * pl.col("a"),
c=2.5 * pl.col("a"),
)
assert q.schema == pl.Schema(
assert q.collect_schema() == pl.Schema(
[
("a", pl.Duration(time_unit="us")),
("b", pl.Duration(time_unit="us")),
Expand Down
Loading

0 comments on commit 020c075

Please sign in to comment.